summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Patrick Simianer <patrick@lilt.com> 2020-02-19 16:19:45 +0100
committer Patrick Simianer <patrick@lilt.com> 2020-02-19 16:19:45 +0100
commit 5178c4f31dd3b8eb1f1cba2b632863f8a92af029 (patch)
tree 1b3b5d0f9f06f8cf658b081675a20a2354dbdf86
parent 8c1c7c21f16e5800d615130831a8932a5d2b2bd2 (diff)
TSV utils
-rwxr-xr-x tsv-joint-set 53
-rwxr-xr-x tsv-uniq 49
2 files changed, 102 insertions, 0 deletions
diff --git a/tsv-joint-set b/tsv-joint-set
new file mode 100755
index 0000000..c0dbdcf
--- /dev/null
+++ b/tsv-joint-set
@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+
+require 'set'
+require 'zipf'
+require 'optimist'
+
+# Draws a random sample of at most --n segments that occur in every given
+# .tsv file (comparing, per file, the column picked via --fields) and writes
+# the matching rows of each input file to "<input>.joint".
+conf = Optimist::options do
+  opt :n, "Desired number segments in test set.", :type => :int, :required => true
+  opt :tsv, ".tsv files", :type => :strings, :required => true
+  opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true
+end
+
+# Every file needs its own field index; a missing one would otherwise only
+# show up later as a TypeError when a column is indexed with nil.
+if conf[:fields].size < conf[:tsv].size
+  Optimist::die :fields, "needs one field index (0 or 1) per .tsv file"
+end
+
+# NOTE(review): ReadFile/WriteFile are not defined in this script (presumably
+# they come from zipf); their handles are never closed explicitly here.
+all = []      # per file: [values of column 0, values of column 1]
+all_sets = [] # per file: distinct values of the selected column
+conf[:tsv].each_with_index { |file,file_index|
+  fp = ReadFile.new file
+  a = [[],[]]
+  while line = fp.gets
+    p0, p1 = line.strip.split "\t"
+    a[0] << p0
+    a[1] << p1
+  end
+  all << a
+  all_sets << a[conf[:fields][file_index]].to_set
+}
+
+# Intersect the per-file sets, shuffle, and keep at most --n segments. The
+# sample is a Set because it is probed once per input row below; an Array
+# lookup there would cost time proportional to the sample size per row.
+sample = all_sets.reduce(:&).to_a.shuffle.take(conf[:n]).to_set
+
+# Rows keep their input order; repeated segments yield one row per occurrence.
+all.each_with_index { |a,i|
+  f = WriteFile.new(conf[:tsv][i] + ".joint")
+  a[conf[:fields][i]].each_with_index { |segment,j|
+    next unless sample.include? segment
+    # Interpolate rather than concatenate: a row without a tab has a nil
+    # second column, and String#+ with nil would raise TypeError.
+    f.write "#{a[0][j]}\t#{a[1][j]}\n"
+  }
+}
+
diff --git a/tsv-uniq b/tsv-uniq
new file mode 100755
index 0000000..fde79f2
--- /dev/null
+++ b/tsv-uniq
@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+
+require 'set'
+
+# Reads tab-separated "source<TAB>target" lines from STDIN and prints each
+# line only the first time it counts as new. ARGV[0] picks the strictness:
+#   1  one side: the segment on the chosen side must be unseen
+#   2  just the pair: the (source, target) pair must be unseen
+#   3  the pair and one side: both of the above must hold
+# Any other strictness prints nothing. Since a pair can only repeat when its
+# chosen-side segment repeats, 3 currently keeps the same lines as 1.
+strictness = ARGV[0].to_i
+
+# Only strictness 1 and 3 look at a single side; ARGV[1] selects it.
+if [1, 3].include? strictness
+  side = ARGV[1].to_i # 0 (source) or 1 (target)
+end
+
+# Column-wise storage: segments[0][i] is the source and segments[1][i] the
+# target of input line i. A line without a tab gets a nil target, which is
+# printed as an empty field.
+segments = [[],[]]
+while line = STDIN.gets
+  src, tgt = line.strip.split "\t"
+  segments[0] << src
+  segments[1] << tgt
+end
+
+# Single pass over the rows in input order, so output order follows input.
+seen = Set.new       # chosen-side segments encountered so far
+seen_pairs = Set.new # [source, target] pairs encountered so far
+segments[0].each_index { |i|
+  # Index the column first, then the row; segments[i][0] would pick the
+  # wrong cell and hits nil (NoMethodError) from the third line on.
+  segment_pair = [segments[0][i], segments[1][i]]
+  # side is nil for strictness 2, so segment stays nil there and is unused.
+  segment = segments[side][i] if side
+  is_new =
+    case strictness
+    when 1 then !seen.include?(segment)
+    when 2 then !seen_pairs.include?(segment_pair)
+    when 3 then !seen_pairs.include?(segment_pair) && !seen.include?(segment)
+    end
+  puts "#{segment_pair[0]}\t#{segment_pair[1]}" if is_new
+  # Record after the check so the first occurrence is the one printed.
+  seen << segment
+  seen_pairs << segment_pair
+}
+