diff options
author | Patrick Simianer <p@simianer.de> | 2019-12-24 09:58:43 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2019-12-24 09:58:43 +0100 |
commit | 44701457f800fa53959625a98ec1972269b64cfd (patch) | |
tree | 8a5495761ce0ee201cedb03e9f891e08ffb85a01 /biuniq | |
parent | 3ba9189598e5d862a119071640dcb666cda98634 (diff) |
biuniq: uniquify a parallel corpus with a dictionary
Diffstat (limited to 'biuniq')
-rwxr-xr-x | biuniq | 35 |
1 files changed, 35 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# biuniq: uniquify a parallel corpus with a dictionary.
#
# Usage: biuniq <file1> <file2>
#
# Reads the two files in parallel, one line from each per step, and keeps a
# line pair only if its first line has not been seen before in <file1> AND its
# second line has not been seen before in <file2>. The surviving pairs are
# written, in their original order, to <file1>.out and <file2>.out.

# Filters parallel line pairs so that every kept line is unique on its side.
#
# Lines are stripped of surrounding whitespace before comparison. Every line
# read is recorded as seen, including lines of pairs that are dropped, so a
# later pair that reuses either side of a dropped pair is dropped as well.
#
# @param in1 [#gets] line source for the first side; drives the iteration
# @param in2 [#gets] line source for the second side; surplus lines beyond the
#   length of +in1+ are ignored
# @return [Array<Array(String, String)>] the kept [line1, line2] pairs
# @raise [ArgumentError] if +in2+ runs out of lines before +in1+ does
def biuniq(in1, in2)
  seen1 = {}
  seen2 = {}
  kept = []

  while (line1 = in1.gets)
    line2 = in2.gets
    raise ArgumentError, 'second input has fewer lines than the first' if line2.nil?

    line1 = line1.strip
    line2 = line2.strip

    kept << [line1, line2] unless seen1.key?(line1) || seen2.key?(line2)

    # Each side has its own dictionary: first-side lines go to seen1 and
    # second-side lines go to seen2, never mixed.
    seen1[line1] = true
    seen2[line2] = true
  end

  kept
end

if ARGV.size >= 2
  # Loaded only when actually processing files, so the usage message below
  # still works when the gem is not installed.
  require 'zipf'

  pairs = biuniq(ReadFile.new(ARGV[0]), ReadFile.new(ARGV[1]))

  o1 = WriteFile.new "#{ARGV[0]}.out"
  o2 = WriteFile.new "#{ARGV[1]}.out"

  pairs.each do |line1, line2|
    o1.write "#{line1}\n"
    o2.write "#{line2}\n"
  end

  # Close explicitly so buffered (possibly compressed) output is flushed.
  # NOTE(review): assumes WriteFile exposes #close - guarded in case it does not.
  [o1, o2].each { |out| out.close if out.respond_to?(:close) }
else
  warn 'usage: biuniq <file1> <file2>'
end