summary refs log tree commit diff
path: root/shard
blob: 615512314ba336f3314e508ddced4a0b671a38c6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env ruby

require 'trollop'

# Splits three line-parallel files (input, references, alignments) into
# +num_shards+ shards. Line i of each file always lands in the same shard, so
# the shards stay parallel to each other.
#
# Shard files are named "<output_prefix>.<shard>.<ext>", where <ext> is the
# text after the last '.' of the input / references path, and
# "<output_prefix>.<shard>.a" for the alignments.
#
# Every shard receives (line count / num_shards) lines. The remainder is
# appended to the last shard when +rand+ is false (keeping the original line
# order contiguous), or handed out one line per shard starting at shard 0 when
# +rand+ is true.
#
# ReadFile and WriteFile are not defined or required in this file.
# NOTE(review): confirm which library provides them and that it is loaded
# before this method is called. Only #readlines, #write and #close are used.
#
# @param input [String] path of the input file
# @param refs [String] path of the references file
# @param alignments [String] path of the alignments file
# @param output_prefix [String] prefix for all generated shard files
# @param num_shards [Integer] number of shards to create (must be >= 1)
# @param rand [Boolean] shuffle the lines before distributing them
# @return [Array(Array<String>, Array<String>)] the input shard file names and
#   the reference shard file names (alignment shard names are not returned)
# @raise [ArgumentError] if +num_shards+ is not a positive integer, or if the
#   references / alignments file has fewer lines than the input file
def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
  unless num_shards.is_a?(Integer) && num_shards > 0
    raise ArgumentError, "num_shards must be a positive integer, got #{num_shards.inspect}"
  end

  # Read each source completely and close it right away, even on error.
  in_lines, refs_lines, a_lines = [input, refs, alignments].map { |fn|
    reader = ReadFile.new fn
    begin
      reader.readlines
    ensure
      reader.close
    end
  }

  # Count the lines that were actually read instead of shelling out to
  # `wc -l`: no shell quoting problems, and an unterminated final line is
  # no longer dropped.
  lc = in_lines.size
  if refs_lines.size < lc || a_lines.size < lc
    raise ArgumentError,
          "line count mismatch: input has #{lc} lines, references #{refs_lines.size}, alignments #{a_lines.size}"
  end

  input_ext = input.split('.').last
  refs_ext = refs.split('.').last

  # Reversed so that successive pops yield 0, 1, 2, ... when not shuffled.
  index = (0...lc).to_a.reverse
  index.shuffle! if rand
  shard_sz = lc / num_shards
  leftover = lc % num_shards

  in_fns = []
  refs_fns = []
  shards = [] # one [input writer, references writer, alignments writer] per shard

  # Writes line j of all three sources to one shard. A missing trailing
  # newline is added so that a following line cannot be glued onto it.
  emit = lambda { |writers, j|
    [in_lines[j], refs_lines[j], a_lines[j]].each_with_index { |line, k|
      writers[k].write(line.end_with?("\n") ? line : "#{line}\n")
    }
  }

  begin
    num_shards.times { |shard|
      in_fn = "#{output_prefix}.#{shard}.#{input_ext}"
      refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}"
      a_fn = "#{output_prefix}.#{shard}.a"
      writers = []
      # Registered before opening so partially opened shards get closed too.
      shards << writers
      [in_fn, refs_fn, a_fn].each { |fn| writers << WriteFile.new(fn) }
      in_fns << in_fn
      refs_fns << refs_fn
      shard_sz.times { emit.call(writers, index.pop) }
    }
    if rand
      # leftover < num_shards, so this gives at most one extra line per shard.
      leftover.times { |shard| emit.call(shards[shard], index.pop) }
    else
      leftover.times { emit.call(shards.last, index.pop) }
    end
  ensure
    # Close every writer, including the alignment shards.
    shards.each { |writers| writers.each { |f| f.close } }
  end

  [in_fns, refs_fns]
end

# Command-line interface: every option except --randomize (-z) is required.
opts = Trollop.options do
  opt(:input, 'input', type: :string, required: true)
  opt(:references, 'references', type: :string, required: true)
  opt(:alignments, 'alignments', type: :string, required: true)
  opt(:output_prefix, 'output prefix', type: :string, required: true)
  opt(:randomize, 'randomize', type: :bool, default: false, short: '-z')
  opt(:num_shards, 'number of shards', type: :int, required: true)
end

# Hand the parsed options to make_shards in its positional order.
make_shards(
  opts[:input],
  opts[:references],
  opts[:alignments],
  opts[:output_prefix],
  opts[:num_shards],
  opts[:randomize]
)