summaryrefslogtreecommitdiff
path: root/min_max
blob: f27de88a8e27c762e0e94db2f21fdb61dd68265c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/ruby

require 'trollop'


# Mark both standard streams as carrying UTF-8 text.
[STDIN, STDOUT].each { |stream| stream.set_encoding 'utf-8' }

# Prints the one-line command synopsis, listing every option with a
# placeholder for its value, to standard output.
def usage
  synopsis = "filter-min-max.rb --min <min> --max <max> --in_f <in f> --in_e <in e> --out_f <out f> --out_e <out e> --out_id <out ids>"
  $stdout.puts synopsis
end
# With no arguments at all, print the synopsis and stop instead of falling
# through to option parsing. The argument count is not checked beyond that:
# --min and --max have defaults, so a valid command line need not contain
# all seven options.
if ARGV.empty?
  usage
  exit 1
end

# Declare and parse the command-line options into the opts hash.
# The five file options have no sensible default and are marked required so
# that a missing one is reported by the parser rather than surfacing later
# as a failure to open a nil path.
# NOTE(review): Trollop may derive the long flag names from these symbols
# with underscores turned into dashes (e.g. --in-f); confirm that this
# matches the flag spelling printed by usage.
opts = Trollop::options do
  opt :min, "minimum #tokens", :type => :int, :default => 1
  opt :max, "maximum #tokens", :type => :int, :default => 80
  opt :in_f, "input 'French' file", :type => :string, :required => true
  opt :in_e, "input 'English' file", :type => :string, :required => true
  opt :out_f, "output 'French' file", :type => :string, :required => true
  opt :out_e, "output 'English' file", :type => :string, :required => true
  opt :out_id, "output line Nos", :type => :string, :required => true
end


# Filter a line-aligned pair of files.
#
# The 'French' and 'English' inputs are read in lockstep, one line each per
# iteration. A pair is kept only when BOTH sides have a whitespace-separated
# token count within opts[:min]..opts[:max] (inclusive). For every kept pair
# the stripped French line, the stripped English line and the zero-based
# index of the pair in the input are written to the three output files.
#
# The loop is driven by the French file, so any English lines beyond the
# last French line are never read.
files = {}
begin
  files[:f_file] = File.new opts[:in_f], 'r:UTF-8'
  files[:e_file] = File.new opts[:in_e], 'r:UTF-8'
  files[:f_out_file] = File.new opts[:out_f], 'w:UTF-8'
  files[:e_out_file] = File.new opts[:out_e], 'w:UTF-8'
  files[:id_out_file] = File.new opts[:out_id], 'w'

  limits = opts[:min]..opts[:max]

  files[:f_file].each_line.with_index do |raw_f, i|
    raw_e = files[:e_file].gets
    # A French line without an English partner means the inputs are not
    # aligned; stop with a clear message instead of failing on nil.
    abort "#{opts[:in_e]}: no line at index #{i} to pair with #{opts[:in_f]}" if raw_e.nil?

    f_line = raw_f.strip
    e_line = raw_e.strip
    next unless limits.cover?(f_line.split.size) && limits.cover?(e_line.split.size)

    files[:f_out_file].write "#{f_line}\n"
    files[:e_out_file].write "#{e_line}\n"
    files[:id_out_file].write "#{i}\n"
  end
ensure
  # Close whatever was opened, even when opening a later file or the
  # filtering itself failed part-way through.
  files.each_value(&:close)
end