summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Simianer <pks@pks.rocks> 2018-03-29 18:21:43 +0200
committer: Patrick Simianer <pks@pks.rocks> 2018-03-29 18:21:43 +0200
commit: 042ec007c64bb168ecf8c9580f8866cc017c4896 (patch)
tree: 53b001102a4c1a92b61e9a24b865412b794343f8
parent: 102feddfb3033640abf2916a17cb9394d94fd638 (diff)
bitext-filter-length
-rwxr-xr-x bitext-filter-length 28
1 files changed, 10 insertions, 18 deletions
diff --git a/bitext-filter-length b/bitext-filter-length
index d812568..f3ed800 100755
--- a/bitext-filter-length
+++ b/bitext-filter-length
@@ -9,10 +9,9 @@ def main
opt :output_suffix, "output suffix", :type => :string, :default => ".out", :short => "-S"
opt :min_len, "minimum length", :type => :int, :default => 1, :short => "-m"
opt :max_len, "maximum length", :type => :int, :default => 1000, :short => "-M"
- #opt :ratio, "length ratio", :type => :float, :default => 0.0001, :short => "-r"
- #opt :ratio_min_len, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-R"
- opt :ratio_avg, "length ratio average", :type => :float, :required => true, :short => "-A"
- opt :ratio_std, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
+ opt :ignore_below, "minimum length to apply ratio test", :type => :int, :default => 7, :short => "-I"
+ opt :ratio_mean, "length ratio average", :type => :float, :required => true, :short => "-A"
+ opt :ratio_stddev, "length ratio standard deviation", :type => :float, :required => true, :short => "-T"
end
fna,fnb = conf[:inputs].split ','
@@ -21,28 +20,21 @@ def main
a_out = WriteFile.new fna+conf[:output_suffix]
b_out = WriteFile.new fnb+conf[:output_suffix]
- ratio_lower = conf[:ratio_avg] - conf[:ratio_std]
- ratio_upper = conf[:ratio_avg] + conf[:ratio_std]
+ ratio_lower = conf[:ratio_mean] - conf[:ratio_stddev]
+ ratio_upper = conf[:ratio_mean] + conf[:ratio_stddev]
while linea = a.gets
lineb = b.gets
sza = linea.strip.split.size
szb = lineb.strip.split.size
- #_ = [sza,szb].map{|i|i.to_f}.sort
ratio = sza.to_f/szb.to_f
- if sza >= conf[:min_len] and szb >= conf[:min_len] and
- sza <= conf[:max_len] and szb <= conf[:max_len] and
- ratio >= ratio_lower and
- ratio <= ratio_upper
- #if _[0] >= conf[:ratio_min_len]
- # ratio_ok = (_[0] / _[1]) >= conf[:ratio]
- #else
- # ratio_ok = true
- #end
- #if ratio_ok
+ if sza <= conf[:ignore_below] and szb <= conf[:ignore_below] or
+ (sza >= conf[:min_len] and szb >= conf[:min_len] and
+ sza <= conf[:max_len] and szb <= conf[:max_len] and
+ ratio >= ratio_lower and
+ ratio <= ratio_upper)
a_out.write linea
b_out.write lineb
- #end
end
end