diff options
author | Patrick Simianer <patrick@lilt.com> | 2021-05-21 17:49:38 +0200 |
---|---|---|
committer | Patrick Simianer <patrick@lilt.com> | 2021-05-21 17:49:38 +0200 |
commit | cd82bf3ff48f054c9e7b7fafcdd7ad6431940db1 (patch) | |
tree | 41b59b77001db3b59333e6a447ab9aa12ac78b7d | |
parent | 8b325ba5ab9c825ec9fde53b87f0b947b0317b6c (diff) |
remove-devtest
-rwxr-xr-x | remove-devtest | 48 |
1 files changed, 48 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# remove-devtest: filter a parallel training corpus so that it shares no
# lines with a dev/test set.
#
# Usage:
#   remove-devtest TRAIN_SRC TRAIN_TGT DEVTEST_SRC DEVTEST_TGT [IGNORE_CASE]
#
# A training pair is dropped when its source line OR its target line occurs
# anywhere in the dev/test data (source or target side). The surviving pairs
# are written to "TRAIN_SRC.out" and "TRAIN_TGT.out".
#
# When exactly five arguments are given (the value of the fifth is
# irrelevant), lines are compared case-insensitively. The lines that are
# written out always keep their original casing.

require 'set'
require 'zipf'

if ARGV.size < 4
  abort 'usage: remove-devtest TRAIN_SRC TRAIN_TGT DEVTEST_SRC DEVTEST_TGT [IGNORE_CASE]'
end

train_src_path, train_tgt_path, devtest_src_path, devtest_tgt_path = ARGV

# Only the presence of a fifth argument matters, not its value.
ignore_case = ARGV.size == 5

# Comparison key for a line. The line terminator is stripped so that a final
# line without a trailing newline (or a CRLF line) still matches its
# counterpart; the key is used for lookups only and is never written out.
comparison_key = lambda do |line|
  key = line.chomp
  ignore_case ? key.downcase : key
end

# Every training line is checked against both dev/test sides, so a single
# set holding the lines of both files is sufficient.
devtest_lines = Set.new
[devtest_src_path, devtest_tgt_path].each do |path|
  devtest_file = ReadFile.new path
  begin
    while (line = devtest_file.gets)
      devtest_lines << comparison_key.call(line)
    end
  ensure
    devtest_file.close
  end
end

train_src = ReadFile.new train_src_path
train_tgt = ReadFile.new train_tgt_path
train_src_out = WriteFile.new "#{train_src_path}.out"
train_tgt_out = WriteFile.new "#{train_tgt_path}.out"

begin
  while (line_src = train_src.gets)
    line_tgt = train_tgt.gets
    # A missing target line means the corpus is not parallel; continuing
    # would produce misaligned output files.
    if line_tgt.nil?
      abort "remove-devtest: #{train_tgt_path} has fewer lines than #{train_src_path}"
    end

    next if devtest_lines.include?(comparison_key.call(line_src)) ||
            devtest_lines.include?(comparison_key.call(line_tgt))

    train_src_out.write line_src
    train_tgt_out.write line_tgt
  end

  # Surplus target lines are ignored (as before), but flag the mismatch.
  if train_tgt.gets
    warn "remove-devtest: #{train_tgt_path} has more lines than #{train_src_path}; extra lines ignored"
  end
ensure
  # Close explicitly so the outputs are flushed even on an early abort.
  [train_src, train_tgt, train_src_out, train_tgt_out].each(&:close)
end