summary | refs | log | tree | commit | diff
path: root/bitext-filter-length
diff options
context:
space:
mode:
author: Patrick Simianer <pks@pks.rocks> 2018-01-30 13:15:58 +0100
committer: Patrick Simianer <pks@pks.rocks> 2018-01-30 13:15:58 +0100
commit: 102feddfb3033640abf2916a17cb9394d94fd638 (patch)
tree: 9be5d2eea86f197f5658ea7f9758bb7093ad19fe /bitext-filter-length
parent: c3caa66919439ff5f92733f5ee7825c4e6783f23 (diff)
parent: bda3b2633935b3e217b17406ed0a379ffb97c2a1 (diff)
Merge branch 'master' of github.com:pks/nlp_scripts
Diffstat (limited to 'bitext-filter-length')
-rwxr-xr-x bitext-filter-length 56
1 files changed, 56 insertions, 0 deletions
diff --git a/bitext-filter-length b/bitext-filter-length
new file mode 100755
index 0000000..d812568
--- /dev/null
+++ b/bitext-filter-length
@@ -0,0 +1,56 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'trollop'
+
+def main
+ conf = Trollop::options do
+ opt :inputs, "inputs, comma separated", :type => :string, :short => "-i", :required => true
+ opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
+ opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
+ opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
+ #opt :ratio, "length ratio", :type => :float, :default => 0.0001, :short => "-r"
+ #opt :ratio_min_len, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-R"
+ opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
+ opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
+ end
+
+ fna,fnb = conf[:inputs].split ','
+ a = ReadFile.new fna
+ b = ReadFile.new fnb
+ a_out = WriteFile.new fna+conf[:output_suffix]
+ b_out = WriteFile.new fnb+conf[:output_suffix]
+
+ ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
+ ratio_upper = conf[:ratio_avg] + conf[:ratio_std]
+
+ while linea = a.gets
+ lineb = b.gets
+ sza = linea.strip.split.size
+ szb = lineb.strip.split.size
+ #_ = [sza,szb].map{|i|i.to_f}.sort
+ ratio = sza.to_f/szb.to_f
+ if sza >= conf[:min_len] and szb >= conf[:min_len] and
+ sza <= conf[:max_len] and szb <= conf[:max_len] and
+ ratio >= ratio_lower and
+ ratio <= ratio_upper
+ #if _[0] >= conf[:ratio_min_len]
+ # ratio_ok = (_[0] / _[1]) >= conf[:ratio]
+ #else
+ # ratio_ok = true
+ #end
+ #if ratio_ok
+ a_out.write linea
+ b_out.write lineb
+ #end
+ end
+ end
+
+ a.close
+ b.close
+ a_out.close
+ b_out.close
+end
+
+main
+