diff options
Diffstat (limited to 'traintestsplit')
-rwxr-xr-x | traintestsplit | 90 |
1 file changed, 43 insertions, 47 deletions
Review notes (free text ahead of the first "diff --git" line; not part of the hunk):
- The hunk was re-flowed to one diff row per line; the extracted text had been
  collapsed onto two physical lines. Indentation inside each row is restored to
  conventional two-space Ruby style and may differ from the committed bytes.
- The hunk header advertises 55 old / 51 new lines. With 47 deletions and
  43 insertions that implies 8 context lines, but only 3 non-blank context
  lines are visible. The remaining 5 are presumably blank lines whose positions
  cannot be recovered from this view, so they are not reproduced here.
- The added row "e = ReadFile(en).readlines" is corrected to
  "e = ReadFile.new(en).readlines", matching the row that reads the foreign
  file. Without ".new" Ruby treats ReadFile(en) as a call to a method named
  ReadFile rather than a constructor call. Because of this correction the
  post-image no longer corresponds to blob 7cc5bcf.

diff --git a/traintestsplit b/traintestsplit
index 7ec52ae..7cc5bcf 100755
--- a/traintestsplit
+++ b/traintestsplit
@@ -1,55 +1,51 @@
 #!/usr/bin/env ruby
+require 'nlp_ruby'
 require 'trollop'
-def main
-  opts = Trollop::options do
-    opt :foreign, "foreign file", :type => :string, :required => true
-    opt :english, "english file", :type => :string, :required => true
-    opt :size, "one size", :type => :int, :required => true
-    opt :repeat, "number of repetitions", :type => :int, :default => 1
-    opt :prefix, "prefix for output files", :type => :string
-  end
-  fn = opts[:foreign]
-  fn_ext = fn.split('.').last
-  f = File.new(fn, 'r').readlines
-  en = opts[:english]
-  en_ext = en.split('.').last
-  e = File.new(en, 'r').readlines
-  size = opts[:size]
-  nlines_f = `wc -l #{fn}`.split()[0].to_i
-  nlines_e = `wc -l #{en}`.split()[0].to_i
-  if nlines_f!=nlines_e
-    STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
-    exit 1
-  end
-
-  prefix = opts[:prefix]
-  a = (0..nlines_e-1).to_a
-  i = 0
-  opts[:repeat].times {
-    b = a.sample(size)
-    ax = a.reject{|j| b.include? j}
-    `mkdir split_#{i}`
-    new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+'
-    new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+'
-    ax.each { |j|
-      new_f.write f[j]
-      new_e.write e[j]
-    }
-    new_f.close; new_e.close
-    new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+'
-    new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+'
-    b.each { |j|
-      new_f.write f[j]
-      new_e.write e[j]
-    }
-    new_f.close; new_e.close
-    i += 1
-  }
+cfg = Trollop::options do
+  opt :foreign, "foreign file", :type => :string, :required => true
+  opt :english, "english file", :type => :string, :required => true
+  opt :size, "one size", :type => :int, :required => true
+  opt :repeat, "number of repetitions", :type => :int, :default => 1
+  opt :prefix, "prefix for output files", :type => :string
+end
+fn = cfg[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.new(fn).readlines
+en = cfg[:english]
+en_ext = en.split('.').last
+e = ReadFile.new(en).readlines
+size = cfg[:size]
+nlines_f = `wc -l #{fn}`.split()[0].to_i
+nlines_e = `wc -l #{en}`.split()[0].to_i
+if nlines_f != nlines_e
+  STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+  exit 1
 end
-
-main
+prefix = cfg[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+cfg[:repeat].times {
+  b = a.sample(size)
+  ax = a.reject{|j| b.include? j}
+  `mkdir split_#{i}`
+  new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}")
+  new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}")
+  ax.each { |j|
+    new_f.write f[j]
+    new_e.write e[j]
+  }
+  new_f.close; new_e.close
+  new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}")
+  new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}")
+  b.each { |j|
+    new_f.write f[j]
+    new_e.write e[j]
+  }
+  new_f.close; new_e.close
+  i += 1
+}