From 44701457f800fa53959625a98ec1972269b64cfd Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Tue, 24 Dec 2019 09:58:43 +0100
Subject: biuniq: uniquify a parallel corpus with a dictionary

---
 biuniq | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 biuniq

(limited to 'biuniq')

diff --git a/biuniq b/biuniq
new file mode 100755
index 0000000..097c88a
--- /dev/null
+++ b/biuniq
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+#
+# biuniq: de-duplicate a parallel corpus given as two line-aligned files.
+# Usage: biuniq <file1> <file2>  (writes <file1>.out and <file2>.out)
+# A pair is kept only if neither of its sides has occurred before, on its
+# own side, in any earlier pair; later repeats of either side are dropped.
+
+require 'zipf'
+
+f1 = ReadFile.new ARGV[0]
+f2 = ReadFile.new ARGV[1]
+
+seen1 = {} # lines already seen in file 1
+seen2 = {} # lines already seen in file 2
+pairs = [] # surviving [line1, line2] pairs, in input order
+
+while (line1 = f1.gets)
+  line2 = f2.gets
+  # A missing partner line means the two files are not line-aligned.
+  abort "biuniq: #{ARGV[1]} has fewer lines than #{ARGV[0]}" if line2.nil?
+  line1.strip!
+  line2.strip!
+
+  pairs << [line1, line2] if !seen1.key?(line1) && !seen2.key?(line2)
+  # Record every line, kept or not, so that later repeats are filtered too.
+  seen1[line1] = true
+  seen2[line2] = true
+end
+
+o1 = WriteFile.new ARGV[0]+".out"
+o2 = WriteFile.new ARGV[1]+".out"
+pairs.each do |left, right|
+  o1.write left + "\n"
+  o2.write right + "\n"
+end
-- 
cgit v1.2.3