From 641e80a4ad7bff2bb0cae447cc39da0eccc662dd Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 17 Apr 2018 15:42:29 +0000 Subject: bitext-filter-length improved --- bitext-filter-length | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/bitext-filter-length b/bitext-filter-length index 7f82a65..9e65454 100755 --- a/bitext-filter-length +++ b/bitext-filter-length @@ -7,41 +7,58 @@ def main conf = Trollop::options do opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S" + opt :output_index, "output index", :type => :bool, :default => false, :short => "-J" opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m" opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M" opt :ignore_below, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-I" opt :ratio_mean, "length ratio average", :type => :float, :required => true, :short => "-A" opt :ratio_stddev, "length ratio standard deviation", :type => :float, :required => true, :short => "-T" + opt :stddev_mult, "+/- n stddevs", :type => :float, :default => 2.0, :short => "-N" + opt :reverse, "length ratios alway > 1", :type => :bool, :default => false, :short => "-r" end fna,fnb = conf[:inputs].split ',' a = ReadFile.new fna b = ReadFile.new fnb - a_out = WriteFile.new fna+conf[:output_suffix] - b_out = WriteFile.new fnb+conf[:output_suffix] - ratio_lower = conf[:ratio_mean] - conf[:ratio_stddev] - ratio_upper = conf[:ratio_mean] + conf[:ratio_stddev] + if not conf[:output_index] + a_out = WriteFile.new fna+conf[:output_suffix] + b_out = WriteFile.new fnb+conf[:output_suffix] + end + + ratio_lower = conf[:ratio_mean] - (conf[:stddev_mult] * conf[:ratio_stddev]) + ratio_upper = conf[:ratio_mean] + (conf[:stddev_mult] * conf[:ratio_stddev]) + i = 0 while linea = a.gets lineb = b.gets sza = linea.strip.split.size szb = lineb.strip.split.size ratio = sza.to_f/szb.to_f + if conf[:reverse] and ratio < 1 + ratio = ratio**(-1) + end if (sza > 0 and sza <= conf[:ignore_below] and szb > 0 and szb <= conf[:ignore_below]) or (sza >= conf[:min_len] and szb >= conf[:min_len] and sza <= conf[:max_len] and szb <= conf[:max_len] and ratio >= ratio_lower and - ratio <= ratio_upper) - a_out.write linea - b_out.write lineb + ratio <= ratio_upper) + if not conf[:output_index] + a_out.write linea + b_out.write lineb + else + puts i + end end + i += 1 end a.close b.close - a_out.close - b_out.close + if not conf[:output_index] + a_out.close + b_out.close + end end main -- cgit v1.2.3