diff options
Diffstat (limited to 'bitext-filter-length')
| -rwxr-xr-x | bitext-filter-length | 56 | 
1 files changed, 56 insertions, 0 deletions
#!/usr/bin/env ruby
#
# bitext-filter-length: filter a line-aligned pair of text files (a bitext).
#
# A line pair is kept only when both sides have a whitespace-token count
# within [--min-len, --max-len] and the length ratio (first / second) lies
# within --ratio-avg +/- --ratio-std. Kept lines are written to
# "<input><output-suffix>" for each of the two inputs.

require 'zipf'
require 'trollop'

# Number of whitespace-separated tokens on a line.
def token_count(line)
  line.strip.split.size
end

# Decides whether a pair with token counts +sza+ / +szb+ passes the filter.
#
# The ratio check intentionally uses plain float comparisons: when +szb+ is 0
# the ratio is Infinity or NaN, and both simply fail the comparisons (NaN
# compares false with everything), so such pairs are rejected without raising.
def keep_pair?(sza, szb, conf, ratio_lower, ratio_upper)
  return false unless sza >= conf[:min_len] && szb >= conf[:min_len]
  return false unless sza <= conf[:max_len] && szb <= conf[:max_len]

  ratio = sza.to_f / szb.to_f
  ratio >= ratio_lower && ratio <= ratio_upper
end

# Parses the command line, then streams both inputs in lock-step and copies
# the pairs accepted by keep_pair? to the two output files.
def main
  conf = Trollop::options do
    opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true
    opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
    opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
    opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
    opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
    opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
  end

  # Only the first two names are used; fail early with a clear message when
  # fewer than two usable names were given.
  fna, fnb = conf[:inputs].split(',')
  if fna.to_s.empty? || fnb.to_s.empty?
    abort "bitext-filter-length: --inputs expects two comma separated file names"
  end

  ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
  ratio_upper = conf[:ratio_avg] + conf[:ratio_std]

  a = b = a_out = b_out = nil
  begin
    a = ReadFile.new fna
    b = ReadFile.new fnb
    a_out = WriteFile.new fna + conf[:output_suffix]
    b_out = WriteFile.new fnb + conf[:output_suffix]

    line_no = 0
    while (linea = a.gets)
      line_no += 1
      lineb = b.gets
      if lineb.nil?
        # The second file ran out first: there is nothing to pair the
        # remaining lines with, so stop instead of crashing on nil.
        warn "bitext-filter-length: '#{fnb}' has fewer lines than '#{fna}', stopping at line #{line_no}"
        break
      end

      next unless keep_pair?(token_count(linea), token_count(lineb), conf, ratio_lower, ratio_upper)

      a_out.write linea
      b_out.write lineb
    end

    # linea is nil only when the first file was read to its end; in that case
    # report any unread remainder of the second file.
    if linea.nil? && b.gets
      warn "bitext-filter-length: '#{fnb}' has more lines than '#{fna}', extra lines ignored"
    end
  ensure
    # Close whatever was opened, even when an error interrupted the run.
    [a, b, a_out, b_out].each { |f| f.close if f }
  end
end

main
