summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <patrick@lilt.com>2021-05-21 17:51:32 +0200
committerPatrick Simianer <patrick@lilt.com>2021-05-21 17:51:32 +0200
commitdbf721675208074eae176dd77d4286d4697e8bdc (patch)
treef7ea1a29530581137a7d53fdfc6c353fb3522f63
parent01b3e6b8ab26e8e9b6940f4e2bd8c1ed558f843f (diff)
remove-test-from-bitext
-rwxr-xr-xremove-test-from-bitext45
1 files changed, 45 insertions, 0 deletions
diff --git a/remove-test-from-bitext b/remove-test-from-bitext
new file mode 100755
index 0000000..43038d3
--- /dev/null
+++ b/remove-test-from-bitext
@@ -0,0 +1,45 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+test_source = ReadFile.new ARGV[0]
+test_target = ReadFile.new ARGV[1]
+all_test_source_lines = {}
+all_test_source_lines.default = false
+all_test_target_lines = {}
+all_test_source_lines.default = false
+
+while test_source_line = test_source.gets
+ test_source_line.strip!
+ test_target_line = test_target.gets
+ test_target_line.strip!
+
+ all_test_source_lines[test_source_line] = true
+ all_test_target_lines[test_target_line] = true
+end
+
+train_source = ReadFile.new ARGV[2]
+train_target = ReadFile.new ARGV[3]
+
+train_source_out = WriteFile.new "#{ARGV[2]}.out"
+train_target_out = WriteFile.new "#{ARGV[3]}.out"
+
+while train_source_line = train_source.gets
+ train_source_line.strip!
+ train_target_line = train_target.gets
+ if train_target_line == nil then train_target_line = "" end
+ train_target_line.strip!
+
+ if not all_test_source_lines[train_source_line] \
+ and not all_test_target_lines[train_target_line]
+ train_source_out.write train_source_line + "\n"
+ train_target_out.write train_target_line + "\n"
+ end
+end
+
+test_source.close
+test_target.close
+train_source.close
+train_target.close
+train_source_out.close
+train_target_out.close