summary refs log tree commit diff
path: root/tsv-joint-set
diff options
context:
space:
mode:
author Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
committer Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
commit 64e8bdba930479249b8dfbc4b5d4b659a95433f0 (patch)
tree e26969b03d8380ee8d2cbc1328f851772006133c /tsv-joint-set
parent 74e20e00dfbffdcf117778049e47acd79e320110 (diff)
parent 4732fb3be94ba3f88b18295cf1c00e8c616eec73 (diff)
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
Diffstat (limited to 'tsv-joint-set')
-rwxr-xr-x tsv-joint-set 53
1 files changed, 53 insertions, 0 deletions
diff --git a/tsv-joint-set b/tsv-joint-set
new file mode 100755
index 0000000..c0dbdcf
--- /dev/null
+++ b/tsv-joint-set
@@ -0,0 +1,53 @@
+#!/usr/bin/env ruby
+
+require 'set'
+require 'zipf'
+require 'optimist'
+
+conf = Optimist::options do
+ opt :n, "Desired number segments in test set.", :type => :int, :required => true
+ opt :tsv, ".tsv files", :type => :strings, :required => true
+ opt :fields, "Which field (0 or 1) to use for each file", :type => :ints, :required => true
+end
+
+all = []
+all_sets = []
+conf[:tsv].each_with_index { |file,file_index|
+ fp = ReadFile.new file
+ a = [[],[]]
+ s = Set.new
+ while line = fp.gets
+ p0, p1 = line.strip.split "\t"
+ a[0] << p0
+ a[1] << p1
+ s << a[conf[:fields][file_index]].last
+ end
+ all << a
+ all_sets << s
+}
+
+joint_set = all_sets.pop
+all_sets.each { |set|
+ joint_set &= set
+}
+sample = joint_set.to_a.shuffle.take conf[:n]
+
+outputs = []
+all.each_with_index { |a,i|
+ o = [[],[]]
+ a[conf[:fields][i]].each_with_index { |segment,j|
+ if sample.include? segment
+ o[0] << a[0][j]
+ o[1] << a[1][j]
+ end
+ }
+ outputs << o
+}
+
+outputs.each_with_index { |o,i|
+ f = WriteFile.new (conf[:tsv][i] + ".joint")
+ o[0].each_index { |j|
+ f.write o[0][j] + "\t" + o[1][j] + "\n"
+ }
+}
+