summary refs log tree commit diff
path: root/tsv-uniq
diff options
context:
space:
mode:
author Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
committer Patrick Simianer <pks@pks.rocks> 2020-08-12 07:32:06 +0200
commit 64e8bdba930479249b8dfbc4b5d4b659a95433f0 (patch)
tree e26969b03d8380ee8d2cbc1328f851772006133c /tsv-uniq
parent 74e20e00dfbffdcf117778049e47acd79e320110 (diff)
parent 4732fb3be94ba3f88b18295cf1c00e8c616eec73 (diff)
Merge branch 'master' of ssh://github.com/pks/nlp_scripts
Diffstat (limited to 'tsv-uniq')
-rwxr-xr-x tsv-uniq 49
1 files changed, 49 insertions, 0 deletions
diff --git a/tsv-uniq b/tsv-uniq
new file mode 100755
index 0000000..fde79f2
--- /dev/null
+++ b/tsv-uniq
@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+
+require 'set'
+
+# Deduplication mode, read from the first command-line argument:
+#   1 = one side, 2 = just the pair, 3 = the pair and one side.
+# String#to_i / nil.to_i turn a non-numeric or missing argument into 0.
+strictness = ARGV[0].to_i
+
+# Modes 1 and 3 read a second argument selecting the side: 0 or 1.
+side = ARGV[1].to_i if [1, 3].include?(strictness)
+
+segments = [[],[]]
+while line = STDIN.gets
+ src, tgt = line.strip.split "\t"
+ segments[0] << src
+ segments[1] << tgt
+end
+
+if strictness == 1
+ seen = Set.new
+ segments[side].each_with_index { |segment,i|
+ if not seen.include? segment
+ puts "#{segments[i][0]}\t#{segments[i][1]}"
+ end
+ seen << segment
+ }
+elsif strictness == 2
+ seen = Set.new
+ segments[0].each_index { |i|
+ segment_pair = [segments[i][0], segments[i][1]]
+ if not seen.include? segment_pair
+ puts "#{segment_pair[0]}\t#{segment_pair[1]}"
+ end
+ seen << segment_pair
+ }
+elsif strictness == 3
+ seen = Set.new
+ seen_pairs = Set.new
+ segments[side].each_with_index { |segment,i|
+ segment_pair = [segments[0][i], segments[1][i]]
+ if not seen_pairs.include? segment_pair and not seen.include? segment
+ puts "#{segment_pair[0]}\t#{segment_pair[1]}"
+ end
+ seen << segment
+ seen_pairs << segment_pair
+ }
+end
+