From db6a6ecfa350cae29739c59df1210d8f76a479c9 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Thu, 5 Dec 2013 07:56:38 +0100
Subject: init

---
 shard | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100755 shard

(limited to 'shard')

diff --git a/shard b/shard
new file mode 100755
index 0000000..7729699
--- /dev/null
+++ b/shard
@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+# Splits three line-aligned files into +num_shards+ shards each.
+#
+# Shard files are named "<output_prefix>.<shard>.<ext>", where <ext> is the
+# text after the last '.' of +input+ / +refs+ and always "a" for alignments.
+# Every shard receives lc / num_shards lines (lc = number of input lines).
+# Leftover lines are appended to the last shard in sequential mode, or dealt
+# one each to the first shards when +rand+ is set.
+#
+# Example: 5 lines, 2 shards, rand=false -> shard 0 holds lines 0-1 and
+# shard 1 holds lines 2-4.
+#
+# input::         path of the input file, one record per line
+# refs::          path of the references file, line-aligned with +input+
+# alignments::    path of the alignments file, line-aligned with +input+
+# output_prefix:: prefix of all shard file names
+# num_shards::    number of shards to create (positive Integer)
+# rand::          if true, lines are assigned to shards in random order
+#
+# Returns [input shard file names, reference shard file names].
+# Raises ArgumentError if +num_shards+ is smaller than 1.
+def make_shards(input, refs, alignments, output_prefix, num_shards=2, rand=false)
+  raise ArgumentError, 'num_shards must be at least 1' if num_shards < 1
+
+  # Count lines from the data itself rather than shelling out to `wc -l`,
+  # which breaks on paths with spaces or shell metacharacters and misses a
+  # final line that has no trailing newline.
+  sources = [input, refs, alignments].map { |fn| File.readlines fn }
+  exts = [input.split('.').last, refs.split('.').last, 'a']
+  lc = sources.first.size
+  shard_sz = lc / num_shards
+  leftover = lc % num_shards
+
+  order = (0...lc).to_a
+  order.shuffle! if rand
+
+  # Line indices per shard; the same index is used for all three files so
+  # that the shards stay aligned.
+  chunks = Array.new(num_shards) { |shard| order[shard * shard_sz, shard_sz] }
+  rest = order[num_shards * shard_sz, leftover]
+  if rand
+    # leftover < num_shards, so each of the first shards gets at most one.
+    rest.each_with_index { |j, shard| chunks[shard] << j }
+  else
+    chunks.last.concat rest
+  end
+
+  fns = exts.map { |ext|
+    (0...num_shards).map { |shard| "#{output_prefix}.#{shard}.#{ext}" }
+  }
+  sources.zip(fns) { |lines, shard_fns|
+    shard_fns.zip(chunks) { |fn, chunk|
+      # The block form closes (and thereby flushes) every shard file,
+      # including the alignment shards.
+      File.open(fn, 'w') { |f|
+        # puts restores a missing final newline; indices past the end of a
+        # shorter file are skipped instead of raising.
+        chunk.each { |j| f.puts lines[j] if lines[j] }
+      }
+    }
+  }
+
+  # Only the input and reference shard names are returned.
+  fns.first(2)
+end
+
+# Command line: all four paths are mandatory; the shard count defaults to 2.
+opts = Trollop::options do
+  opt :input, 'input', :type => :string, :required => true
+  opt :references, 'references', :type => :string, :required => true
+  opt :alignments, 'alignments', :type => :string, :required => true
+  opt :output_prefix, 'output prefix', :type => :string, :required => true
+  opt :randomize, 'randomize', :type => :bool, :default => false, :short => '-z'
+  opt :num_shards, 'number of shards', :type => :int, :default => 2
+end
+make_shards(opts[:input], opts[:references], opts[:alignments], opts[:output_prefix], opts[:num_shards], opts[:randomize])
+
-- 
cgit v1.2.3