summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2013-12-11 20:29:42 +0100
committerPatrick Simianer <p@simianer.de>2013-12-11 20:29:42 +0100
commit820c06de59307ad06557a603ffee08679fcf3e43 (patch)
tree40e95e04d66c4dd184aca7553960e12297d0ea4a
parent1dcf5842ce8a684a4943d3a67a8eb1c210359851 (diff)
train test split script
-rwxr-xr-xtraintestsplit55
1 files changed, 55 insertions, 0 deletions
diff --git a/traintestsplit b/traintestsplit
new file mode 100755
index 0000000..7ec52ae
--- /dev/null
+++ b/traintestsplit
@@ -0,0 +1,55 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+
+def main
+ opts = Trollop::options do
+ opt :foreign, "foreign file", :type => :string, :required => true
+ opt :english, "english file", :type => :string, :required => true
+ opt :size, "one size", :type => :int, :required => true
+ opt :repeat, "number of repetitions", :type => :int, :default => 1
+ opt :prefix, "prefix for output files", :type => :string
+ end
+ fn = opts[:foreign]
+ fn_ext = fn.split('.').last
+ f = File.new(fn, 'r').readlines
+ en = opts[:english]
+ en_ext = en.split('.').last
+ e = File.new(en, 'r').readlines
+ size = opts[:size]
+ nlines_f = `wc -l #{fn}`.split()[0].to_i
+ nlines_e = `wc -l #{en}`.split()[0].to_i
+ if nlines_f!=nlines_e
+ STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+ exit 1
+ end
+
+ prefix = opts[:prefix]
+ a = (0..nlines_e-1).to_a
+ i = 0
+ opts[:repeat].times {
+ b = a.sample(size)
+ ax = a.reject{|j| b.include? j}
+ `mkdir split_#{i}`
+ new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+'
+ new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+'
+ ax.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+'
+ new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+'
+ b.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ i += 1
+ }
+end
+
+
+main
+