summaryrefslogtreecommitdiff
path: root/bitext-filter-length
blob: d812568f03db34f246cb29faffe67738633ff38a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env ruby

require 'zipf'
require 'trollop'

# Filters a line-aligned pair of text files by token count and length ratio.
#
# The two file names come from the comma-separated :inputs option. Both files
# are read in lockstep; a line pair is copied to the outputs (input name plus
# :output_suffix) only when
#   * both whitespace-separated token counts are within [:min_len, :max_len], and
#   * (tokens in first file) / (tokens in second file) lies within
#     [:ratio_avg - :ratio_std, :ratio_avg + :ratio_std].
#
# ReadFile and WriteFile are not defined in this file (presumably provided by
# the required 'zipf' library); only #gets, #write and #close are used.
#
# Raises ArgumentError when :inputs does not name two files, or when the second
# file runs out of lines before the first. Extra lines in the second file are
# ignored, with a warning on standard error.
def main
  conf = Trollop::options do
    opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true
    opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
    opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
    opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
    opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
    opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
  end

  fna, fnb = conf[:inputs].split ','
  if fna.to_s.empty? || fnb.to_s.empty?
    raise ArgumentError, "inputs must name two files separated by a comma, got #{conf[:inputs].inspect}"
  end

  # Loop-invariant settings, looked up once.
  min_len = conf[:min_len]
  max_len = conf[:max_len]
  ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
  ratio_upper = conf[:ratio_avg] + conf[:ratio_std]

  # Every handle that was opened successfully is recorded here so that the
  # ensure clause can close it even when a later step raises.
  opened = []
  begin
    opened << (a = ReadFile.new(fna))
    opened << (b = ReadFile.new(fnb))
    opened << (a_out = WriteFile.new(fna + conf[:output_suffix]))
    opened << (b_out = WriteFile.new(fnb + conf[:output_suffix]))

    lineno = 0
    while (linea = a.gets)
      lineno += 1
      lineb = b.gets
      if lineb.nil?
        raise ArgumentError, "#{fnb} has fewer lines than #{fna} (ran out at line #{lineno})"
      end

      sza = linea.strip.split.size
      szb = lineb.strip.split.size
      # Float division: an empty second line yields Infinity or NaN rather than
      # an exception. The explicit comparisons below are simply false for NaN,
      # so such a pair is dropped.
      ratio = sza.to_f / szb.to_f

      next unless sza >= min_len && szb >= min_len &&
                  sza <= max_len && szb <= max_len &&
                  ratio >= ratio_lower && ratio <= ratio_upper

      a_out.write linea
      b_out.write lineb
    end

    warn "warning: #{fnb} has more lines than #{fna}; extra lines ignored" if b.gets
  ensure
    opened.each(&:close)
  end
end

# Script entry point: runs unconditionally whenever this file is executed or loaded.
main