diff options
author | Patrick Simianer <pks@pks.rocks> | 2018-03-29 18:21:43 +0200 |
---|---|---|
committer | Patrick Simianer <pks@pks.rocks> | 2018-03-29 18:21:43 +0200 |
commit | 042ec007c64bb168ecf8c9580f8866cc017c4896 (patch) | |
tree | 53b001102a4c1a92b61e9a24b865412b794343f8 | |
parent | 102feddfb3033640abf2916a17cb9394d94fd638 (diff) |
bitext-filter-length
-rwxr-xr-x | bitext-filter-length | 28 |
1 file changed, 10 insertions, 18 deletions
diff --git a/bitext-filter-length b/bitext-filter-length index d812568..f3ed800 100755 --- a/bitext-filter-length +++ b/bitext-filter-length @@ -9,10 +9,9 @@ def main opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S" opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m" opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M" - #opt :ratio, "length ratio", :type => :float, :default => 0.0001, :short => "-r" - #opt :ratio_min_len, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-R" - opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A" - opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T" + opt :ignore_below, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-I" + opt :ratio_mean, "length ratio average", :type => :float, :required => true, :short => "-A" + opt :ratio_stddev, "length ratio standard deviation", :type => :float, :required => true, :short => "-T" end fna,fnb = conf[:inputs].split ',' @@ -21,28 +20,21 @@ def main a_out = WriteFile.new fna+conf[:output_suffix] b_out = WriteFile.new fnb+conf[:output_suffix] - ratio_lower = conf[:ratio_avg] - conf[:ratio_std] - ratio_upper = conf[:ratio_avg] + conf[:ratio_std] + ratio_lower = conf[:ratio_mean] - conf[:ratio_stddev] + ratio_upper = conf[:ratio_mean] + conf[:ratio_stddev] while linea = a.gets lineb = b.gets sza = linea.strip.split.size szb = lineb.strip.split.size - #_ = [sza,szb].map{|i|i.to_f}.sort ratio = sza.to_f/szb.to_f - if sza >= conf[:min_len] and szb >= conf[:min_len] and - sza <= conf[:max_len] and szb <= conf[:max_len] and - ratio >= ratio_lower and - ratio <= ratio_upper - #if _[0] >= conf[:ratio_min_len] - # ratio_ok = (_[0] / _[1]) >= conf[:ratio] - #else - # ratio_ok = true - #end - #if ratio_ok + if sza <= conf[:ignore_below] and szb <= conf[:ignore_below] or + 
(sza >= conf[:min_len] and szb >= conf[:min_len] and + sza <= conf[:max_len] and szb <= conf[:max_len] and + ratio >= ratio_lower and + ratio <= ratio_upper) a_out.write linea b_out.write lineb - #end end end |