summary | refs | log | tree | commit | diff
path: root/remove-devtest
blob: 8e026f9f520dec53750297df2a83f7f3735a331f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env ruby

require 'zipf'

# Filters a parallel training corpus against a dev/test set.
#
# Usage:
#   remove-devtest <train_src> <train_tgt> <devtest_src> <devtest_tgt> [strict]
#
# The two training files are read in lockstep, one line from each per step.
# A pair is written to "<train_src>.out" / "<train_tgt>.out" only when neither
# of its two lines equals any line found in either dev/test file. When exactly
# five arguments are given (the value of the fifth is ignored), "strict" mode
# is enabled and lines are compared case-insensitively.
#
# ReadFile and WriteFile are provided by the required 'zipf' library.

if ARGV.size < 4
  abort "usage: #{$PROGRAM_NAME} <train_src> <train_tgt> <devtest_src> <devtest_tgt> [strict]"
end

train_src_path, train_tgt_path, devtest_src_path, devtest_tgt_path = ARGV

# Strict mode is triggered purely by the argument count, as before.
strict = ARGV.size == 5

# Builds the comparison key for a line. The line terminator is stripped so a
# final line lacking a trailing newline still matches an otherwise identical
# line; in strict mode the key is additionally lower-cased. The caller's
# string is never mutated, so the original text is what gets written out.
normalize = lambda do |line|
  key = line.chomp
  strict ? key.downcase : key
end

# Closes a handle if it supports it.
# NOTE(review): zipf's ReadFile/WriteFile are assumed to expose #close; the
# respond_to? guard keeps this a no-op should they not.
close_handle = ->(handle) { handle.close if handle.respond_to?(:close) }

# Both sides of the dev/test set go into a single lookup table: a training
# pair is rejected if either of its lines appears on either dev/test side, so
# keeping the two sides apart would not change the outcome.
devtest_lines = {}
[devtest_src_path, devtest_tgt_path].each do |path|
  devtest = ReadFile.new path
  begin
    while (line = devtest.gets)
      devtest_lines[normalize.call(line)] = true
    end
  ensure
    close_handle.call(devtest)
  end
end

handles = []
begin
  handles << (train_src = ReadFile.new(train_src_path))
  handles << (train_tgt = ReadFile.new(train_tgt_path))
  handles << (train_src_out = WriteFile.new("#{train_src_path}.out"))
  handles << (train_tgt_out = WriteFile.new("#{train_tgt_path}.out"))

  while (line_src = train_src.gets)
    line_tgt = train_tgt.gets
    # A missing target line means the corpus is not parallel; continuing
    # would either crash on nil or emit misaligned output files.
    abort "#{train_tgt_path} has fewer lines than #{train_src_path}" if line_tgt.nil?

    next if devtest_lines.key?(normalize.call(line_src)) ||
            devtest_lines.key?(normalize.call(line_tgt))

    train_src_out.write line_src
    train_tgt_out.write line_tgt
  end

  # Leftover target lines are not copied; report them instead of ignoring them.
  warn "#{train_tgt_path} has more lines than #{train_src_path}; extra lines ignored" if train_tgt.gets
ensure
  # Runs on normal completion and on abort, so output is flushed either way.
  handles.each { |handle| close_handle.call(handle) }
end