diff options
-rwxr-xr-x | tsv-joint-set | 53 | ||||
-rwxr-xr-x | tsv-uniq | 49 |
2 files changed, 102 insertions, 0 deletions
# ============================================================================
# File: tsv-joint-set   (new file, mode 100755)
# ============================================================================
#!/usr/bin/env ruby
#
# Reads several two-column, tab-separated files. For each file one column
# (chosen via --fields) acts as the key column. The keys present in *every*
# file are intersected, up to --n of them are drawn at random, and for each
# input file the rows whose key was drawn are written to "<input>.joint".
#
# NOTE(review): ReadFile / WriteFile are not defined in this script; presumably
# they are provided by the 'zipf' gem -- confirm. Output handles are never
# closed explicitly here (same as before); verify they are flushed at exit.

require 'set'
require 'zipf'
require 'optimist'

conf = Optimist::options do
  opt :n, "Desired number segments in test set.", :type => :int, :required => true
  opt :tsv, ".tsv files", :type => :strings, :required => true
  opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true
end

files  = conf[:tsv]
fields = conf[:fields]

# Every file needs its own key-column index; without one the column lookup
# below would fail with an unhelpful TypeError / NoMethodError.
if fields.size < files.size
  Optimist.die :fields, "needs one value (0 or 1) per .tsv file"
end
unless fields.first(files.size).all? { |f| [0, 1].include?(f) }
  Optimist.die :fields, "must be 0 or 1"
end

tables   = [] # per file: [first-column values, second-column values]
key_sets = [] # per file: Set of the values found in its key column
files.zip(fields) do |file, field|
  fp = ReadFile.new file
  columns = [[], []]
  keys = Set.new
  while (line = fp.gets)
    # strip also removes leading/trailing tabs, so an empty first or last
    # field is lost here (unchanged from the original behaviour).
    p0, p1 = line.strip.split "\t"
    columns[0] << p0
    columns[1] << p1
    keys << columns[field].last
  end
  tables << columns
  key_sets << keys
end

# Keys that occur in the key column of all files.
joint_set = key_sets.reduce(:&)

# Random selection of at most n shared keys; kept as a Set so that the
# per-row membership test below does not scan an Array each time.
sample = joint_set.to_a.shuffle.take(conf[:n]).to_set

files.each_with_index do |file, i|
  columns = tables[i]
  out = WriteFile.new("#{file}.joint")
  columns[fields[i]].each_with_index do |segment, j|
    next unless sample.include?(segment)

    # Interpolation (rather than String#+) so a missing column is written
    # as an empty field instead of raising.
    out.write "#{columns[0][j]}\t#{columns[1][j]}\n"
  end
end

# ============================================================================
# File: tsv-uniq   (new file, mode 100755)
# ============================================================================
#!/usr/bin/env ruby
#
# Filters two-column, tab-separated lines from STDIN and prints the first
# occurrence of each row to STDOUT, in input order.
#
# Usage: tsv-uniq STRICTNESS [SIDE] < in.tsv
#   STRICTNESS 1  a row is printed only if its SIDE column value is new
#   STRICTNESS 2  a row is printed only if the (column 0, column 1) pair is new
#   STRICTNESS 3  a row is printed only if both the pair and its SIDE column
#                 value are new
#   SIDE          0 or 1; read only for STRICTNESS 1 and 3
# Any other STRICTNESS value produces no output.

require 'set'

strictness = ARGV[0].to_i
side = nil

if [1, 3].include?(strictness)
  side = ARGV[1].to_i # 0 or 1
  # Any other index has no matching column and used to end in a
  # NoMethodError on nil; report it instead.
  abort "tsv-uniq: side must be 0 or 1" unless [0, 1].include?(side)
end

# segments[0] holds every first-column value, segments[1] every second-column
# value; row i of the input is (segments[0][i], segments[1][i]).
segments = [[], []]
while (line = STDIN.gets)
  src, tgt = line.strip.split "\t"
  segments[0] << src
  segments[1] << tgt
end

case strictness
when 1
  seen = Set.new
  segments[side].each_with_index do |segment, i|
    # Set#add? returns nil when the element was already present.
    # Index order is column first, row second (the original had them swapped).
    puts "#{segments[0][i]}\t#{segments[1][i]}" if seen.add?(segment)
  end
when 2
  seen = Set.new
  segments[0].each_index do |i|
    segment_pair = [segments[0][i], segments[1][i]]
    puts "#{segment_pair[0]}\t#{segment_pair[1]}" if seen.add?(segment_pair)
  end
when 3
  seen = Set.new
  seen_pairs = Set.new
  segments[side].each_with_index do |segment, i|
    segment_pair = [segments[0][i], segments[1][i]]
    # Both sets must be updated on every row, so evaluate both add? calls
    # before combining the results (no short-circuit).
    new_pair = seen_pairs.add?(segment_pair)
    new_side = seen.add?(segment)
    puts "#{segment_pair[0]}\t#{segment_pair[1]}" if new_pair && new_side
  end
end