diff options
author | Patrick Simianer <pks@pks.rocks> | 2020-08-12 07:32:06 +0200 |
---|---|---|
committer | Patrick Simianer <pks@pks.rocks> | 2020-08-12 07:32:06 +0200 |
commit | 64e8bdba930479249b8dfbc4b5d4b659a95433f0 (patch) | |
tree | e26969b03d8380ee8d2cbc1328f851772006133c /tsv-uniq | |
parent | 74e20e00dfbffdcf117778049e47acd79e320110 (diff) | |
parent | 4732fb3be94ba3f88b18295cf1c00e8c616eec73 (diff) |
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
Diffstat (limited to 'tsv-uniq')
-rwxr-xr-x | tsv-uniq | 49 |
1 files changed, 49 insertions, 0 deletions
#!/usr/bin/env ruby

# tsv-uniq: drop duplicate rows from two-column, tab-separated input.
#
# Usage: tsv-uniq STRICTNESS [SIDE] < input.tsv
#
# STRICTNESS selects what counts as a duplicate of an earlier row:
#   1  the segment in column SIDE was already seen
#   2  the whole (source, target) pair was already seen
#   3  the pair or the segment in column SIDE was already seen
# SIDE is 0 (first column) or 1 (second column). It defaults to 0 and is
# only consulted for strictness 1 and 3.
#
# The first occurrence of every row is kept and input order is preserved.

require 'set'

# Splits one input line into its first two tab-separated fields.
#
# Surrounding whitespace (including the newline) is stripped before
# splitting. Fields that are absent come back as nil; fields beyond the
# second are ignored.
#
# @param line [String] one raw input line
# @return [Array] two-element array [source, target]
def parse_segment_pair(line)
  src, tgt = line.strip.split("\t")
  [src, tgt]
end

# Returns the rows of +pairs+ that survive de-duplication, in input order.
#
# @param pairs [Array<Array>] rows; only the first two cells of a row are used
# @param strictness [Integer] 1, 2 or 3 (see the header comment);
#   any other value selects nothing and yields an empty array
# @param side [Integer] column used for strictness 1 and 3 (0 or 1)
# @return [Array<Array>] the retained rows as [source, target] arrays
# @raise [ArgumentError] if strictness is 1 or 3 and side is neither 0 nor 1
def unique_segment_pairs(pairs, strictness, side = 0)
  keyed_by_side = [1, 3].include?(strictness)
  if keyed_by_side && ![0, 1].include?(side)
    raise ArgumentError, "side must be 0 or 1, got #{side.inspect}"
  end
  return [] unless [1, 2, 3].include?(strictness)

  seen = Set.new
  pairs.each_with_object([]) do |row, kept|
    pair = [row[0], row[1]]
    # For strictness 3 a row is dropped when either its pair or its
    # side segment was seen before. Every recorded pair also records its
    # side segment, so an unseen side segment implies an unseen pair and
    # the side segment alone is a sufficient key.
    key = keyed_by_side ? pair[side] : pair
    # Set#add? returns nil when the key is already present.
    kept << pair if seen.add?(key)
  end
end

if __FILE__ == $PROGRAM_NAME
  strictness = ARGV[0].to_i
  side = ARGV[1].to_i # nil.to_i is 0, so SIDE defaults to the first column

  if [1, 2, 3].include?(strictness)
    pairs = STDIN.each_line.map { |line| parse_segment_pair(line) }
    unique_segment_pairs(pairs, strictness, side).each do |src, tgt|
      puts "#{src}\t#{tgt}"
    end
  else
    warn 'usage: tsv-uniq STRICTNESS(1|2|3) [SIDE(0|1)] < input.tsv'
  end
end