diff options
Diffstat (limited to 'shard')
-rwxr-xr-x | shard | 81 |
1 files changed, 81 insertions, 0 deletions
#!/usr/bin/env ruby

require 'trollop'

# Splits three line-parallel files (input, references, alignments) into
# +num_shards+ shards, keeping line i of all three files in the same shard.
#
# Shard files are named "<output_prefix>.<shard>.<ext>", where <ext> is the
# text after the last '.' of the corresponding source path for the input and
# reference shards, and the fixed suffix "a" for the alignment shards.
#
# Every shard receives lc / num_shards lines (lc = number of input lines).
# The remaining lc % num_shards lines are appended to the last shard when
# +rand+ is false, or handed out one each to the first shards when +rand+
# is true. Without +rand+ the shards are contiguous slices in file order.
#
# All three files are read into memory in full.
#
# @param input [String] path of the input file
# @param refs [String] path of the references file
# @param alignments [String] path of the alignments file
# @param output_prefix [String] prefix of every shard file name
# @param num_shards [Integer] number of shards to write (must be > 0)
# @param rand [Boolean] shuffle the line order before sharding
# @return [Array<Array<String>>] [input shard paths, reference shard paths];
#   the alignment shard paths are not returned
# @raise [ArgumentError] if +num_shards+ is not a positive Integer, or if the
#   references/alignments file has fewer lines than the input
def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
  unless num_shards.is_a?(Integer) && num_shards > 0
    raise ArgumentError, "num_shards must be a positive integer, got #{num_shards.inspect}"
  end

  in_lines = File.readlines(input)
  refs_lines = File.readlines(refs)
  a_lines = File.readlines(alignments)

  # Count lines from what was actually read instead of shelling out to
  # `wc -l`: no shell quoting problems, and a final line without a trailing
  # newline is not silently dropped.
  lc = in_lines.size
  if refs_lines.size < lc || a_lines.size < lc
    # A shorter parallel file would otherwise yield nil lines that get
    # written as empty strings, corrupting the shards.
    raise ArgumentError,
          "references (#{refs_lines.size} lines) and alignments (#{a_lines.size} lines) " \
          "must each have at least as many lines as input (#{lc} lines)"
  end

  input_ext = input.split('.').last
  refs_ext = refs.split('.').last

  order = (0...lc).to_a
  order.shuffle! if rand

  shard_sz, leftover = lc.divmod(num_shards)
  sizes = Array.new(num_shards, shard_sz)
  if rand
    # leftover < num_shards, so this gives one extra line to the first shards.
    leftover.times { |shard| sizes[shard] += 1 }
  else
    sizes[-1] += leftover
  end

  in_fns = []
  refs_fns = []
  offset = 0
  sizes.each_with_index do |size, shard|
    picked = order[offset, size]
    offset += size

    in_fn = "#{output_prefix}.#{shard}.#{input_ext}"
    refs_fn = "#{output_prefix}.#{shard}.#{refs_ext}"
    a_fn = "#{output_prefix}.#{shard}.a"

    write_shard_file(in_fn, in_lines, picked)
    write_shard_file(refs_fn, refs_lines, picked)
    write_shard_file(a_fn, a_lines, picked)

    in_fns << in_fn
    refs_fns << refs_fn
  end

  [in_fns, refs_fns]
end

# Writes lines[j] for every j in +indices+ to +path+ (truncating it), in the
# given order. The block form of File.open closes the file even on error.
def write_shard_file(path, lines, indices)
  File.open(path, 'w') do |f|
    indices.each do |j|
      line = lines[j]
      # Only a file's final line can lack a terminator; add one so that a
      # shuffled order never glues two records together.
      f.write(line.end_with?("\n") ? line : "#{line}\n")
    end
  end
end

opts = Trollop::options do
  opt :input, 'input', :type => :string, :required => true
  opt :references, 'references', :type => :string, :required => true
  opt :alignments, 'alignments', :type => :string, :required => true
  opt :output_prefix, 'output prefix', :type => :string, :required => true
  opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
  # Default mirrors make_shards' own default; without it an omitted flag
  # would be passed through as an explicit nil.
  opt :num_shards, 'number of shards', :type => :int, :default => 2
end

make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])