summary | refs | log | tree | commit | diff
path: root/train_test_split
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
committer Patrick Simianer <p@simianer.de> 2016-07-05 11:01:46 +0200
commit 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch)
tree 5a06ee7de98640a39244b57bb369697176b44ebf /train_test_split
parent 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff)
mv
Diffstat (limited to 'train_test_split')
-rwxr-xr-x train_test_split 50
1 files changed, 0 insertions, 50 deletions
diff --git a/train_test_split b/train_test_split
deleted file mode 100755
index 4d8153a..0000000
--- a/train_test_split
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-require 'trollop'
-
-conf = Trollop::options do
- opt :foreign, "foreign file", :type => :string, :required => true
- opt :english, "english file", :type => :string, :required => true
- opt :size, "one size", :type => :int, :required => true
- opt :repeat, "number of repetitions", :type => :int, :default => 1
- opt :prefix, "prefix for output files", :type => :string
-end
-fn = conf[:foreign]
-fn_ext = fn.split('.').last
-f = ReadFile.readlines fn
-en = conf[:english]
-en_ext = en.split('.').last
-e = ReadFile.readlines en
-size = conf[:size]
-nlines_f = `wc -l #{fn}`.split()[0].to_i
-nlines_e = `wc -l #{en}`.split()[0].to_i
-if nlines_f != nlines_e
- STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
- exit 1
-end
-
-prefix = conf[:prefix]
-a = (0..nlines_e-1).to_a
-i = 0
-conf[:repeat].times {
- b = a.sample(size)
- ax = a.reject{|j| b.include? j}
- `mkdir split_#{i}`
- new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
- ax.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
- b.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- i += 1
-}
-