diff options
author | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2016-07-05 11:01:46 +0200 |
commit | 2b1d7f881c19c4d4b5afae194e02d3300c7675d0 (patch) | |
tree | 5a06ee7de98640a39244b57bb369697176b44ebf /train_test_split | |
parent | 69949dda35c3ea21d8e926e5f0a596a0a0f61c6a (diff) |
mv
Diffstat (limited to 'train_test_split')
-rwxr-xr-x | train_test_split | 50 |
1 files changed, 0 insertions, 50 deletions
```diff
diff --git a/train_test_split b/train_test_split
deleted file mode 100755
index 4d8153a..0000000
--- a/train_test_split
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-require 'trollop'
-
-conf = Trollop::options do
-  opt :foreign, "foreign file", :type => :string, :required => true
-  opt :english, "english file", :type => :string, :required => true
-  opt :size, "one size", :type => :int, :required => true
-  opt :repeat, "number of repetitions", :type => :int, :default => 1
-  opt :prefix, "prefix for output files", :type => :string
-end
-fn = conf[:foreign]
-fn_ext = fn.split('.').last
-f = ReadFile.readlines fn
-en = conf[:english]
-en_ext = en.split('.').last
-e = ReadFile.readlines en
-size = conf[:size]
-nlines_f = `wc -l #{fn}`.split()[0].to_i
-nlines_e = `wc -l #{en}`.split()[0].to_i
-if nlines_f != nlines_e
-  STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
-  exit 1
-end
-
-prefix = conf[:prefix]
-a = (0..nlines_e-1).to_a
-i = 0
-conf[:repeat].times {
-  b = a.sample(size)
-  ax = a.reject{|j| b.include? j}
-  `mkdir split_#{i}`
-  new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
-  new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
-  ax.each { |j|
-    new_f.write f[j]
-    new_e.write e[j]
-  }
-  new_f.close; new_e.close
-  new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
-  new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
-  b.each { |j|
-    new_f.write f[j]
-    new_e.write e[j]
-  }
-  new_f.close; new_e.close
-  i += 1
-}
-
```