diff options
author | Patrick Simianer <p@simianer.de> | 2013-12-11 20:29:42 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2013-12-11 20:29:42 +0100 |
commit | 820c06de59307ad06557a603ffee08679fcf3e43 (patch) | |
tree | 40e95e04d66c4dd184aca7553960e12297d0ea4a | |
parent | 1dcf5842ce8a684a4943d3a67a8eb1c210359851 (diff) |
train test split script
-rwxr-xr-x | traintestsplit | 55 |
1 files changed, 55 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# Train/test split script.
#
# Reads two line-aligned files (--foreign and --english) and, for each of
# --repeat repetitions i, writes a random split into a directory `split_<i>`:
#
#   split_<i>/<prefix>.test.<i>.<ext>   -- --size randomly sampled lines
#   split_<i>/<prefix>.train.<i>.<ext>  -- all remaining lines, in input order
#
# <ext> is whatever follows the last '.' in the respective input path. The same
# line indices are used for both files, so the outputs stay line-aligned.
# When --prefix is omitted it interpolates as an empty string, so the output
# names start with a dot (e.g. `.train.0.<ext>`).

require 'trollop'

# Writes the lines selected by +indices+ (in the given order) for every side.
#
# @param stem [String] output path without the trailing ".<ext>"
# @param indices [Array<Integer>] line indices to emit
# @param sides [Array<Array(Array<String>, String)>] pairs of [lines, ext]
# @return [void]
def write_split(stem, indices, sides)
  sides.each do |lines, ext|
    # Block form guarantees the handle is closed even if a write raises.
    File.open("#{stem}.#{ext}", 'w') do |out|
      indices.each { |j| out.write lines[j] }
    end
  end
end

# Parses the command line, validates the two inputs and writes the splits.
# Exits with status 1 if the inputs cannot be split safely.
#
# @return [void]
def main
  opts = Trollop.options do
    opt :foreign, "foreign file", type: :string, required: true
    opt :english, "english file", type: :string, required: true
    opt :size, "one size", type: :int, required: true
    opt :repeat, "number of repetitions", type: :int, default: 1
    opt :prefix, "prefix for output files", type: :string
  end

  foreign_path = opts[:foreign]
  english_path = opts[:english]
  foreign_ext = foreign_path.split('.').last
  english_ext = english_path.split('.').last

  # Output names differ only by extension; identical extensions would make
  # both sides write to the very same file and clobber each other.
  if foreign_ext == english_ext
    abort "Input files share the extension '#{foreign_ext}', output files would collide, exiting!"
  end

  # File.readlines closes the file itself; the array sizes are the line counts
  # (no `wc -l` shell-out, and a last line without trailing newline is counted).
  foreign = File.readlines(foreign_path)
  english = File.readlines(english_path)
  unless foreign.size == english.size
    abort "Unbalanced files (#{foreign.size} vs. #{english.size}), exiting!"
  end

  size = opts[:size]
  prefix = opts[:prefix]
  sides = [[foreign, foreign_ext], [english, english_ext]]
  all_indices = (0...foreign.size).to_a

  opts[:repeat].times do |i|
    test_indices = all_indices.sample(size)
    # Array difference keeps the original order of the remaining indices.
    train_indices = all_indices - test_indices

    dir = "split_#{i}"
    # Re-use an existing directory, as the previous `mkdir` shell-out did.
    Dir.mkdir(dir) unless Dir.exist?(dir)

    write_split("#{dir}/#{prefix}.train.#{i}", train_indices, sides)
    write_split("#{dir}/#{prefix}.test.#{i}", test_indices, sides)
  end
end

main