blob: 8e026f9f520dec53750297df2a83f7f3735a331f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
#!/usr/bin/env ruby
require 'zipf'
# Filters a parallel training corpus against a held-out dev/test corpus.
#
# Usage: <script> TRAIN_SRC TRAIN_TGT DEVTEST_SRC DEVTEST_TGT [STRICT]
#
# A training pair is dropped when its source line OR its target line occurs
# anywhere in the dev/test data (on either the source or the target side).
# Surviving pairs are written, unmodified, to TRAIN_SRC.out and TRAIN_TGT.out.
#
# "Strict" mode is enabled when exactly five arguments are given (the value of
# the fifth argument is not inspected); lines are then compared after
# downcasing. Lines are compared exactly as returned by `gets`.
if ARGV.size < 4
  abort "usage: #{$PROGRAM_NAME} TRAIN_SRC TRAIN_TGT DEVTEST_SRC DEVTEST_TGT [STRICT]"
end

train_src = ReadFile.new ARGV[0]
train_tgt = ReadFile.new ARGV[1]
strict = ARGV.size == 5

# Maps a line to the form used for comparison; never mutates its argument.
normalize = ->(line) { strict ? line.downcase : line }

# Every dev/test line from both sides goes into one lookup table: the filter
# below tests both training lines against both sides, which is the same as
# testing them against the union.
held_out = {}
ARGV[2, 2].each do |path|
  devtest = ReadFile.new path
  while (line = devtest.gets)
    held_out[normalize.call(line)] = true
  end
  devtest.close
end

train_src_out = WriteFile.new "#{ARGV[0]}.out"
train_tgt_out = WriteFile.new "#{ARGV[1]}.out"
begin
  while (line_src = train_src.gets)
    line_tgt = train_tgt.gets
    if line_tgt.nil?
      # The corpora are expected to be line-aligned; writing a source line
      # without its target would silently misalign the output files.
      warn "#{ARGV[1]} has fewer lines than #{ARGV[0]}; stopping at the last complete pair"
      break
    end
    next if held_out.key?(normalize.call(line_src)) || held_out.key?(normalize.call(line_tgt))

    train_src_out.write line_src
    train_tgt_out.write line_tgt
  end
ensure
  # Close explicitly so buffered output is flushed even if the loop raises.
  [train_src_out, train_tgt_out, train_src, train_tgt].each(&:close)
end
|