summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2019-12-24 09:58:43 +0100
committer Patrick Simianer <p@simianer.de> 2019-12-24 09:58:43 +0100
commit 44701457f800fa53959625a98ec1972269b64cfd (patch)
tree 8a5495761ce0ee201cedb03e9f891e08ffb85a01
parent 3ba9189598e5d862a119071640dcb666cda98634 (diff)
biuniq: uniquify a parallel corpus with a dictionary
-rwxr-xr-x biuniq 35
1 files changed, 35 insertions, 0 deletions
diff --git a/biuniq b/biuniq
new file mode 100755
index 0000000..097c88a
--- /dev/null
+++ b/biuniq
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+f1 = ReadFile.new ARGV[0]
+f2 = ReadFile.new ARGV[1]
+
+d1 = {}
+d2 = {}
+a1 = []
+a2 = []
+
+while line1 = f1.gets
+ line1.strip!
+
+ line2 = f2.gets
+ line2.strip!
+
+ if !d1.include? line1 and !d2.include? line2
+ a1 << line1
+ a2 << line2
+ end
+
+ d1[line1] = true
+ d1[line2] = true
+end
+
+o1 = WriteFile.new ARGV[0]+".out"
+o2 = WriteFile.new ARGV[1]+".out"
+
+a1.each_with_index { |line1,i|
+ o1.write line1 + "\n"
+ o2.write a2[i] + "\n"
+}
+