diff options
Diffstat (limited to 'traintestsplit')
| -rwxr-xr-x | traintestsplit | 51 | 
1 files changed, 0 insertions, 51 deletions
#!/usr/bin/env ruby

# Randomly splits a line-aligned pair of files (a "foreign" file and an
# "english" file) into train and test portions.
#
# For every repetition i (0-based) a directory split_<i> is created in the
# current working directory and four files are written into it:
#   split_<i>/<prefix>.train.<i>.<ext>
#   split_<i>/<prefix>.test.<i>.<ext>
# once per input file, where <ext> is the text after the last '.' of that
# input file's name. The test portion holds --size randomly sampled line
# pairs; the train portion holds all remaining pairs in their original order.

require 'fileutils'
require 'zipf'
require 'trollop'

cfg = Trollop::options do
  opt :foreign, "foreign file", :type => :string, :required => true
  opt :english, "english file", :type => :string, :required => true
  opt :size, "one size", :type => :int, :required => true
  opt :repeat, "number of repetitions", :type => :int, :default => 1
  opt :prefix, "prefix for output files", :type => :string
end

# Writes the line pairs selected by +indices+ to the two output paths, one
# line per index, so both output files stay aligned with each other.
# Both handles are closed even if a write raises.
def write_split(indices, foreign_lines, english_lines, foreign_path, english_path)
  out_f = WriteFile.new foreign_path
  out_e = WriteFile.new english_path
  indices.each do |j|
    out_f.write foreign_lines[j]
    out_e.write english_lines[j]
  end
ensure
  out_f.close if out_f
  out_e.close if out_e
end

fn = cfg[:foreign]
fn_ext = fn.split('.').last
f = ReadFile.readlines fn
en = cfg[:english]
en_ext = en.split('.').last
e = ReadFile.readlines en
size = cfg[:size]

# Count the lines that were actually read instead of shelling out to `wc -l`:
# the shell call broke on file names containing spaces or shell
# metacharacters, and `wc -l` counts newline characters, so a final line
# without a trailing newline was never included in any split.
nlines_f = f.size
nlines_e = e.size
if nlines_f != nlines_e
  STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
  exit 1
end

# NOTE: when --prefix is omitted this is nil and interpolates as an empty
# string, so the output file names start with a dot (".train.0.<ext>").
prefix = cfg[:prefix]
all_indices = (0...nlines_e).to_a

cfg[:repeat].times do |i|
  test_indices = all_indices.sample(size)
  # Array difference keeps the original order of the remaining indices.
  train_indices = all_indices - test_indices

  dir = "split_#{i}"
  # mkdir_p does not fail when the directory already exists.
  FileUtils.mkdir_p dir

  write_split train_indices, f, e,
              "#{dir}/#{prefix}.train.#{i}.#{fn_ext}",
              "#{dir}/#{prefix}.train.#{i}.#{en_ext}"
  write_split test_indices, f, e,
              "#{dir}/#{prefix}.test.#{i}.#{fn_ext}",
              "#{dir}/#{prefix}.test.#{i}.#{en_ext}"
end
