summaryrefslogtreecommitdiff
path: root/bitext-filter-length
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2017-12-14 22:47:11 +0000
committer Patrick Simianer <p@simianer.de> 2017-12-14 22:47:11 +0000
commit ab0f845d69b18905915863769facc238090a273c (patch)
tree 4fde3456dd6f4a01e7ac33609d1f122ae8d46467 /bitext-filter-length
parent 5a53215ed46e12db68cdd321a6e1228956b163e0 (diff)
bitext-filter-length
Diffstat (limited to 'bitext-filter-length')
-rwxr-xr-x bitext-filter-length 56
1 files changed, 56 insertions, 0 deletions
diff --git a/bitext-filter-length b/bitext-filter-length
new file mode 100755
index 0000000..d812568
--- /dev/null
+++ b/bitext-filter-length
@@ -0,0 +1,56 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'trollop'
+
+def main
+ conf = Trollop::options do
+ opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true
+ opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
+ opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
+ opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
+ #opt :ratio, "length ratio", :type => :float, :default => 0.0001, :short => "-r"
+ #opt :ratio_min_len, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-R"
+ opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
+ opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
+ end
+
+ fna,fnb = conf[:inputs].split ','
+ a = ReadFile.new fna
+ b = ReadFile.new fnb
+ a_out = WriteFile.new fna+conf[:output_suffix]
+ b_out = WriteFile.new fnb+conf[:output_suffix]
+
+ ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
+ ratio_upper = conf[:ratio_avg] + conf[:ratio_std]
+
+ while linea = a.gets
+ lineb = b.gets
+ sza = linea.strip.split.size
+ szb = lineb.strip.split.size
+ #_ = [sza,szb].map{|i|i.to_f}.sort
+ ratio = sza.to_f/szb.to_f
+ if sza >= conf[:min_len] and szb >= conf[:min_len] and
+ sza <= conf[:max_len] and szb <= conf[:max_len] and
+ ratio >= ratio_lower and
+ ratio <= ratio_upper
+ #if _[0] >= conf[:ratio_min_len]
+ # ratio_ok = (_[0] / _[1]) >= conf[:ratio]
+ #else
+ # ratio_ok = true
+ #end
+ #if ratio_ok
+ a_out.write linea
+ b_out.write lineb
+ #end
+ end
+ end
+
+ a.close
+ b.close
+ a_out.close
+ b_out.close
+end
+
+main
+