summary refs log tree commit diff
path: root/shard
diff options
context:
space:
mode:
Diffstat (limited to 'shard')
-rwxr-xr-x shard 81
1 files changed, 81 insertions, 0 deletions
diff --git a/shard b/shard
new file mode 100755
index 0000000..7729699
--- /dev/null
+++ b/shard
@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# Splits three line-parallel files into shards, keeping line i of all three
+# files in the same shard.
+#
+# input, refs, alignments:: paths of the files to split; refs and alignments
+#                           need at least as many lines as input
+# output_prefix::           shard s of a file is written to
+#                           "<output_prefix>.<s>.<ext>", where <ext> is the
+#                           text after the last '.' of the input or refs
+#                           path, and always "a" for alignment shards
+# num_shards::              number of shards, a positive integer
+# rand::                    when true, lines are assigned in shuffled order
+#
+# Every shard receives lc / num_shards lines (lc = number of input lines).
+# The lc % num_shards leftover lines are all appended to the last shard, or,
+# when rand is true, dealt one each to shards 0, 1, 2, ...
+#
+# Returns [input shard names, refs shard names]; alignment shard names are
+# not returned. Raises ArgumentError for a non-positive num_shards or when
+# refs or alignments has fewer lines than input.
+def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
+  unless num_shards.is_a?(Integer) && num_shards > 0
+    raise ArgumentError, 'num_shards must be a positive integer'
+  end
+
+  in_lines = File.readlines(input)
+  refs_lines = File.readlines(refs)
+  a_lines = File.readlines(alignments)
+  # Count the lines actually read rather than shelling out to wc -l, which
+  # fails on paths that need shell quoting and ignores an unterminated last line.
+  lc = in_lines.size
+  if refs_lines.size < lc || a_lines.size < lc
+    raise ArgumentError, 'refs and alignments need at least as many lines as input'
+  end
+
+  order = (0...lc).to_a
+  order.shuffle! if rand
+  shard_sz = lc / num_shards
+
+  sources = [in_lines, refs_lines, a_lines]
+  exts = [input.split('.').last, refs.split('.').last, 'a']
+  # fns[k][s] is the name of the shard s file that is fed from sources[k].
+  fns = exts.map { |ext|
+    (0...num_shards).map { |s| "#{output_prefix}.#{s}.#{ext}" }
+  }
+  files = fns.map { |names| names.map { |fn| File.new(fn, 'w') } }
+  begin
+    # puts (not write) terminates an unterminated last line, so it cannot run
+    # into the line written after it.
+    put = lambda { |s, j|
+      sources.each_with_index { |lines, k| files[k][s].puts lines[j] }
+    }
+    (0...num_shards).each { |s|
+      order.shift(shard_sz).each { |j| put.call(s, j) }
+    }
+    # Only the leftover line numbers remain in order at this point.
+    order.each_with_index { |j, n|
+      put.call((rand ? n : num_shards - 1), j)
+    }
+  ensure
+    # Closes the alignment shards too, so all output is flushed on return.
+    files.flatten.each(&:close)
+  end
+  fns.first(2)
+end
+
+opts = Trollop::options do
+  opt :input, 'input', :type => :string, :required => true
+  opt :references, 'references', :type => :string, :required => true
+  opt :alignments, 'alignments', :type => :string, :required => true
+  opt :output_prefix, 'output prefix', :type => :string, :required => true
+  opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
+  opt :num_shards, 'number of shards', :type => :int, :default => 2
+end
+# Paths are required and num_shards defaults to 2, so no nil reaches make_shards.
+make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
+