#!/usr/bin/env ruby

# bitext-filter-length
#
# Filters two line-aligned text files in parallel. A line pair is kept only
# when both token counts lie within [min_len, max_len] and the ratio of the
# two token counts lies within ratio_avg +/- ratio_std. Kept pairs are written
# to "<input><output_suffix>" for each input.
#
# Provenance: commit ab0f845d69b18905915863769facc238090a273c,
# Patrick Simianer, Thu, 14 Dec 2017 22:47:11 +0000, "bitext-filter-length".

require 'zipf'
require 'trollop'

# Parses the command line and returns Trollop's option hash.
def parse_options
  Trollop::options do
    opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true
    opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
    opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
    opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
    # NOTE: an earlier, disabled variant offered :ratio / :ratio_min_len
    # (shorter/longer length ratio, applied only above a minimum length).
    # It is superseded by the average/standard-deviation window below.
    opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
    opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
  end
end

# Decides whether a line pair passes the filter.
#
# sza, szb     - token counts of the first and second line
# conf         - option hash providing :min_len and :max_len
# ratio_lower  - smallest accepted value of sza / szb
# ratio_upper  - largest accepted value of sza / szb
#
# Returns true when the pair should be kept, false otherwise.
def keep_pair?(sza, szb, conf, ratio_lower, ratio_upper)
  lengths_ok = [sza, szb].all? do |n|
    n >= conf[:min_len] && n <= conf[:max_len]
  end
  return false unless lengths_ok

  # Float division: szb == 0 yields Infinity or NaN rather than raising, and
  # neither value satisfies both comparisons for a finite window.
  ratio = sza.to_f / szb.to_f
  ratio >= ratio_lower && ratio <= ratio_upper
end

# Entry point: reads both inputs in lockstep and writes the surviving pairs.
#
# Aborts (non-zero exit) when fewer than two input names are given or when the
# second input has fewer lines than the first. Warns on stderr when the second
# input has more lines than the first; those extra lines are not processed.
def main
  conf = parse_options

  fna, fnb = conf[:inputs].split(',')
  if fna.nil? || fna.empty? || fnb.nil? || fnb.empty?
    abort "--inputs expects two comma separated file names, got '#{conf[:inputs]}'"
  end

  ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
  ratio_upper = conf[:ratio_avg] + conf[:ratio_std]

  a = b = a_out = b_out = nil
  begin
    # ReadFile / WriteFile are not defined in this script; presumably they
    # come from the zipf gem required above.
    a = ReadFile.new fna
    b = ReadFile.new fnb
    a_out = WriteFile.new fna + conf[:output_suffix]
    b_out = WriteFile.new fnb + conf[:output_suffix]

    lineno = 0
    while (linea = a.gets)
      lineno += 1
      lineb = b.gets
      # Without this check a shorter second file fails with NoMethodError on nil.
      abort "#{fnb} ends before #{fna}: no line #{lineno}" if lineb.nil?

      sza = linea.strip.split.size
      szb = lineb.strip.split.size
      next unless keep_pair?(sza, szb, conf, ratio_lower, ratio_upper)

      a_out.write linea
      b_out.write lineb
    end

    # Leftover lines in the second file have no partner and are not written.
    warn "#{fnb} has more lines than #{fna}; extra lines ignored" if b.gets
  ensure
    # Close whatever was opened, including on errors and on abort.
    [a, b, a_out, b_out].each { |f| f.close if f }
  end
end

main