diff options
author | Patrick Simianer <patrick@lilt.com> | 2020-02-19 16:19:45 +0100 |
---|---|---|
committer | Patrick Simianer <patrick@lilt.com> | 2020-02-19 16:19:45 +0100 |
commit | 5178c4f31dd3b8eb1f1cba2b632863f8a92af029 (patch) | |
tree | 1b3b5d0f9f06f8cf658b081675a20a2354dbdf86 /tsv-uniq | |
parent | 8c1c7c21f16e5800d615130831a8932a5d2b2bd2 (diff) |
TSV utils
Diffstat (limited to 'tsv-uniq')
-rwxr-xr-x | tsv-uniq | 49 |
1 files changed, 49 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# tsv-uniq: de-duplicate two-column (source<TAB>target) TSV read from STDIN.
#
# Usage: tsv-uniq STRICTNESS [SIDE] < input.tsv
#
#   STRICTNESS 1  drop a row when its SIDE column value was already seen
#   STRICTNESS 2  drop a row when the exact (source, target) pair was already seen
#   STRICTNESS 3  drop a row when the pair or its SIDE column value was already seen
#   SIDE          0 = source column, 1 = target column (read for modes 1 and 3 only)
#
# The first occurrence of a row is kept and input order is preserved.
# Any other STRICTNESS value (including a missing argument, which parses
# to 0) prints nothing.

require 'set'

# Reads every line from +io+ and splits it on tabs into source and target.
#
# @param io [#each_line] input stream, e.g. STDIN
# @return [Array(Array, Array)] +[sources, targets]+, parallel arrays by row
def read_segments(io)
  segments = [[], []]
  io.each_line do |line|
    # NOTE: strip also trims leading/trailing tabs, so a row whose first
    # column is empty loses that column; kept for output compatibility.
    src, tgt = line.strip.split("\t")
    segments[0] << src
    segments[1] << tgt
  end
  segments
end

# Selects the rows that survive de-duplication, in input order.
#
# @param segments [Array(Array, Array)] +[sources, targets]+ as parallel arrays
# @param strictness [Integer] 1, 2 or 3 (see file header); other values keep nothing
# @param side [Integer, nil] 0 (source) or 1 (target); required for modes 1 and 3
# @return [Array<Array>] the kept +[source, target]+ rows
# @raise [ArgumentError] if mode 1 or 3 is requested with a side other than 0 or 1
def unique_rows(segments, strictness, side = nil)
  if [1, 3].include?(strictness) && ![0, 1].include?(side)
    raise ArgumentError, "side must be 0 or 1, got #{side.inspect}"
  end

  seen_sides = Set.new
  seen_pairs = Set.new

  # Work row-wise: segments is column-major, so segments[i] is NOT row i.
  segments[0].zip(segments[1]).select do |pair|
    case strictness
    when 1 then seen_sides.add?(pair[side])
    when 2 then seen_pairs.add?(pair)
    when 3
      # Record both keys unconditionally (no short-circuit) before deciding.
      new_side = seen_sides.add?(pair[side])
      new_pair = seen_pairs.add?(pair)
      new_side && new_pair
    end
  end
end

strictness = ARGV[0].to_i
side = ARGV[1].to_i if [1, 3].include?(strictness)

unique_rows(read_segments(STDIN), strictness, side).each do |src, tgt|
  puts "#{src}\t#{tgt}"
end