summary refs log tree commit diff
path: root/train-test-split
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
committer Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
commit 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch)
tree 5a06ee7de98640a39244b57bb369697176b44ebf /train-test-split
parent 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff)
mv
Diffstat (limited to 'train-test-split')
-rwxr-xr-x train-test-split 50
1 files changed, 50 insertions, 0 deletions
diff --git a/train-test-split b/train-test-split
new file mode 100755
index 0000000..4d8153a
--- /dev/null
+++ b/train-test-split
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+require 'trollop'
+
+conf = Trollop::options do
+ opt :foreign, "foreign file", :type => :string, :required => true
+ opt :english, "english file", :type => :string, :required => true
+ opt :size, "one size", :type => :int, :required => true
+ opt :repeat, "number of repetitions", :type => :int, :default => 1
+ opt :prefix, "prefix for output files", :type => :string
+end
+fn = conf[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.readlines fn
+en = conf[:english]
+en_ext = en.split('.').last
+e = ReadFile.readlines en
+size = conf[:size]
+nlines_f = `wc -l #{fn}`.split()[0].to_i
+nlines_e = `wc -l #{en}`.split()[0].to_i
+if nlines_f != nlines_e
+ STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+ exit 1
+end
+
+prefix = conf[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+conf[:repeat].times {
+ b = a.sample(size)
+ ax = a.reject{|j| b.include? j}
+ `mkdir split_#{i}`
+ new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
+ ax.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
+ b.each { |j|
+ new_f.write f[j]
+ new_e.write e[j]
+ }
+ new_f.close; new_e.close
+ i += 1
+}
+