diff options
author | Patrick Simianer <pks@pks.rocks> | 2018-01-30 13:15:58 +0100 |
---|---|---|
committer | Patrick Simianer <pks@pks.rocks> | 2018-01-30 13:15:58 +0100 |
commit | 102feddfb3033640abf2916a17cb9394d94fd638 (patch) | |
tree | 9be5d2eea86f197f5658ea7f9758bb7093ad19fe | |
parent | c3caa66919439ff5f92733f5ee7825c4e6783f23 (diff) | |
parent | bda3b2633935b3e217b17406ed0a379ffb97c2a1 (diff) |
Merge branch 'master' of github.com:pks/nlp_scripts
-rwxr-xr-x | bitext-filter-length | 56 | ||||
-rwxr-xr-x | length-ratio | 12 |
2 files changed, 68 insertions, 0 deletions
diff --git a/bitext-filter-length b/bitext-filter-length
new file mode 100755
index 0000000..d812568
--- /dev/null
+++ b/bitext-filter-length
@@ -0,0 +1,56 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'trollop'
+
+# Filters two parallel files line by line: a pair is copied to
+# <input><output_suffix> when both token counts lie in [min_len, max_len] and
+# tokens(first) / tokens(second) lies within ratio_avg +/- ratio_std.
+def main
+  conf = Trollop::options do
+    opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true
+    opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
+    opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
+    opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
+    opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
+    opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
+  end
+
+  fna, fnb = conf[:inputs].split ','
+  # Fail with a usage error instead of a nil crash when only one file is given.
+  Trollop::die :inputs, "must name two comma separated files" unless fnb
+
+  a = ReadFile.new fna
+  b = ReadFile.new fnb
+  a_out = WriteFile.new fna + conf[:output_suffix]
+  b_out = WriteFile.new fnb + conf[:output_suffix]
+
+  lengths = conf[:min_len]..conf[:max_len]
+  ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
+  ratio_upper = conf[:ratio_avg] + conf[:ratio_std]
+
+  while (linea = a.gets)
+    lineb = b.gets
+    # The inputs are not parallel if the second file ends first; report that
+    # instead of failing on nil.
+    abort "#{fnb} has fewer lines than #{fna}" unless lineb
+
+    sza = linea.strip.split.size
+    szb = lineb.strip.split.size
+    next unless lengths.cover?(sza) && lengths.cover?(szb)
+    # Float division: an empty second line (reachable only with min_len <= 0)
+    # gives Infinity or NaN, which fail the comparisons against finite bounds.
+    ratio = sza.to_f / szb
+    next unless ratio >= ratio_lower && ratio <= ratio_upper
+
+    a_out.write linea
+    b_out.write lineb
+  end
+  warn "#{fnb} has more lines than #{fna}; extra lines ignored" if b.gets
+ensure
+  # Also runs when abort or Trollop::die exits, so opened files get closed.
+  [a, b, a_out, b_out].compact.each(&:close)
+end
+
+main
+
diff --git a/length-ratio b/length-ratio
new file mode 100755
index 0000000..4b4432d
--- /dev/null
+++ b/length-ratio
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+# Prints tokens(ARGV[0]) / tokens(ARGV[1]) for each line pair of two parallel files.
+require 'zipf'
+
+abort "usage: length-ratio <file-a> <file-b>" if ARGV.size < 2
+a, b = ARGV.first(2).map { |fn| ReadFile.new fn }
+while (linea = a.gets)
+  lineb = b.gets
+  abort "#{ARGV[1]} has fewer lines than #{ARGV[0]}" unless lineb
+  puts linea.strip.split.size.to_f / lineb.strip.split.size.to_f
+end
+[a, b].each(&:close)