diff options
author | Patrick Simianer <patrick@lilt.com> | 2021-05-21 17:51:32 +0200 |
---|---|---|
committer | Patrick Simianer <patrick@lilt.com> | 2021-05-21 17:51:32 +0200 |
commit | dbf721675208074eae176dd77d4286d4697e8bdc (patch) | |
tree | f7ea1a29530581137a7d53fdfc6c353fb3522f63 /remove-test-from-bitext | |
parent | 01b3e6b8ab26e8e9b6940f4e2bd8c1ed558f843f (diff) |
remove-test-from-bitext
Diffstat (limited to 'remove-test-from-bitext')
-rwxr-xr-x | remove-test-from-bitext | 45 |
1 files changed, 45 insertions, 0 deletions
#!/usr/bin/env ruby
#
# remove-test-from-bitext (new file, mode 100755, blob 43038d3)
#
# Filters a parallel training corpus so that it does not overlap with a
# parallel test set.
#
# Usage:
#   remove-test-from-bitext <test-source> <test-target> <train-source> <train-target>
#
# The filtered training data is written to "<train-source>.out" and
# "<train-target>.out". A training pair is dropped when its whitespace-stripped
# source line occurs anywhere in the test source file OR its whitespace-stripped
# target line occurs anywhere in the test target file.

# ReadFile / WriteFile come from the zipf gem; only #gets, #write and #close
# are used here.
require 'zipf'

if ARGV.size < 4
  STDERR.puts "Usage: #{$0} <test-source> <test-target> <train-source> <train-target>"
  exit 1
end

test_source_path, test_target_path, train_source_path, train_target_path = ARGV

test_source = ReadFile.new test_source_path
test_target = ReadFile.new test_target_path

# Membership tables for the test lines; unknown lines look up as false.
all_test_source_lines = Hash.new(false)
all_test_target_lines = Hash.new(false)

# The test source file drives the loop: target lines beyond the end of the
# source file are never read.
while test_source_line = test_source.gets
  all_test_source_lines[test_source_line.strip] = true

  # The target file may be shorter than the source file. In that case there is
  # no target line to register; do not record "" for it, since that would
  # cause every training pair with an empty target to be dropped.
  test_target_line = test_target.gets
  all_test_target_lines[test_target_line.strip] = true if test_target_line
end

train_source = ReadFile.new train_source_path
train_target = ReadFile.new train_target_path

train_source_out = WriteFile.new "#{train_source_path}.out"
train_target_out = WriteFile.new "#{train_target_path}.out"

while train_source_line = train_source.gets
  train_source_line = train_source_line.strip
  # A missing target line (target file shorter than source) is treated as an
  # empty line so the two output files stay aligned line by line.
  train_target_line = (train_target.gets || "").strip

  # Drop the pair if either side also appears in the test set.
  next if all_test_source_lines[train_source_line] ||
          all_test_target_lines[train_target_line]

  train_source_out.write train_source_line + "\n"
  train_target_out.write train_target_line + "\n"
end

test_source.close
test_target.close
train_source.close
train_target.close
train_source_out.close
train_target_out.close