diff options
author | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
commit | 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch) | |
tree | 5a06ee7de98640a39244b57bb369697176b44ebf /train-test-split | |
parent | 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff) |
mv
Diffstat (limited to 'train-test-split')
-rwxr-xr-x | train-test-split | 50 |
1 files changed, 50 insertions, 0 deletions
#!/usr/bin/env ruby

# Splits a pair of line-aligned files (foreign/english) into random
# train/test partitions.
#
# For every repetition i (0-based) a directory "split_<i>" is created and four
# files are written into it:
#   <prefix>.train.<i>.<foreign ext>   <prefix>.train.<i>.<english ext>
#   <prefix>.test.<i>.<foreign ext>    <prefix>.test.<i>.<english ext>
# The test files hold --size randomly sampled line pairs (in sampled order);
# the train files hold all remaining pairs in their original order.
#
# Exits with status 1 if the two inputs have different numbers of lines or if
# --size is negative.

require 'fileutils'
require 'zipf'
require 'trollop'

# Writes lines[j] for every j in +indices+ (in that order) to +path+.
#
# A line that does not already end in a newline (typically the last line of an
# input without a final newline) gets one appended so that it cannot be fused
# with the following line when it is written somewhere other than the end.
# The handle is closed even if a write raises.
def write_selection(path, lines, indices)
  out = WriteFile.new path
  begin
    indices.each do |j|
      line = lines[j]
      out.write(line.end_with?("\n") ? line : "#{line}\n")
    end
  ensure
    out.close
  end
end

conf = Trollop::options do
  opt :foreign, "foreign file", :type => :string, :required => true
  opt :english, "english file", :type => :string, :required => true
  opt :size, "one size", :type => :int, :required => true
  opt :repeat, "number of repetitions", :type => :int, :default => 1
  opt :prefix, "prefix for output files", :type => :string
end

fn = conf[:foreign]
fn_ext = fn.split('.').last
f = ReadFile.readlines fn
en = conf[:english]
en_ext = en.split('.').last
e = ReadFile.readlines en
size = conf[:size]

# Count the lines that were actually read instead of shelling out to `wc -l`:
# no shell interpolation of file names, and the counts always agree with the
# arrays that are indexed below (wc counts newline characters, so it is off by
# one for a file without a trailing newline).
nlines_f = f.size
nlines_e = e.size
if nlines_f != nlines_e
  STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
  exit 1
end

if size < 0
  # Array#sample raises ArgumentError on a negative count; fail readably instead.
  STDERR.write "Negative size (#{size}), exiting!\n"
  exit 1
end
if size > nlines_e
  # Not fatal: sampling is capped at the number of lines, leaving train empty.
  STDERR.write "Warning: size (#{size}) exceeds number of lines (#{nlines_e}), train sets will be empty\n"
end

# NOTE(review): --prefix has no default; when it is omitted the file names
# presumably start with ".train"/".test" -- confirm that this is intended.
prefix = conf[:prefix]
all_indices = (0...nlines_e).to_a

conf[:repeat].times do |i|
  test_indices = all_indices.sample(size)
  # Array difference keeps the original order of the remaining indices.
  train_indices = all_indices - test_indices

  dir = "split_#{i}"
  FileUtils.mkdir_p dir

  write_selection "#{dir}/#{prefix}.train.#{i}.#{fn_ext}", f, train_indices
  write_selection "#{dir}/#{prefix}.train.#{i}.#{en_ext}", e, train_indices
  write_selection "#{dir}/#{prefix}.test.#{i}.#{fn_ext}", f, test_indices
  write_selection "#{dir}/#{prefix}.test.#{i}.#{en_ext}", e, test_indices
end