#!/usr/bin/env ruby
#
# Splits a line-aligned pair of files (source/target) into randomly sampled
# "devtest" sets and a "train" set built from the remaining lines.
#
# For every repetition i (0-based) the script writes into directory split_<i>:
#   <prefix>.devtest.<s>.<source_ext> / <prefix>.devtest.<s>.<target_ext>
#     one pair per set s, each holding up to --size sampled line pairs
#   <prefix>.train.<source_ext> / <prefix>.train.<target_ext>
#     every non-sampled line pair whose lowercased source AND lowercased
#     target do not occur in any devtest set of this repetition
#
# Exits with status 1 if the two input files differ in line count.
#
# NOTE(review): ReadFile / WriteFile are presumably provided by the 'zipf'
# gem; only the calls visible below (readlines, new, write, close) are relied on.

require 'zipf'
require 'optimist'

conf = Optimist::options do
  opt :source, "source file", :type => :string, :required => true
  opt :target, "target file", :type => :string, :required => true
  opt :size, "one size", :type => :int, :required => true
  opt :repeat, "number of repetitions", :type => :int, :default => 1
  opt :prefix, "prefix for output files", :type => :string, :default => "split"
  opt :sets, "number of sets", :type => :int, :default => 1
end

source_filename = conf[:source]
# The output extension is whatever follows the last '.' of the input name.
source_extension = source_filename.split('.').last
source_lines = ReadFile.readlines source_filename
target_filename = conf[:target]
target_extension = target_filename.split('.').last
target_lines = ReadFile.readlines target_filename
size = conf[:size]

if source_lines.size != target_lines.size
  STDERR.write "Unbalanced files (#{source_lines.size} vs. #{target_lines.size}), exiting!\n"
  exit 1
end

# All valid line indices of the (equally long) input files.
index = (0...source_lines.size).to_a

conf[:repeat].times do |i|
  out_dir = "split_#{i}"
  Dir.mkdir(out_dir) unless Dir.exist?(out_dir)

  # One draw without replacement covers all sets, so the sets are disjoint.
  sampled = index.sample(size * conf[:sets])

  # Lowercased devtest lines, used to keep duplicates out of the train set.
  test_strings_source = {}
  test_strings_target = {}

  conf[:sets].times do |s|
    source_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.devtest.#{s}.#{source_extension}"
    target_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.devtest.#{s}.#{target_extension}"

    # Set s owns sampled[s * size, size]. slice returns nil when the start
    # lies beyond the end of a short sample, hence the empty-array fallback.
    (sampled.slice(s * size, size) || []).each do |j|
      source_file.write source_lines[j]
      target_file.write target_lines[j]
      test_strings_source[source_lines[j].downcase] = true
      test_strings_target[target_lines[j].downcase] = true
    end

    source_file.close
    target_file.close
  end

  # index holds unique values, so array difference equals rejecting sampled ones.
  filtered_index = index - sampled

  source_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.train.#{source_extension}"
  target_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.train.#{target_extension}"

  filtered_index.each do |j|
    # Skip pairs whose text (case-insensitively) also appears in a devtest set.
    next if test_strings_source.key?(source_lines[j].downcase)
    next if test_strings_target.key?(target_lines[j].downcase)

    source_file.write source_lines[j]
    target_file.write target_lines[j]
  end

  source_file.close
  target_file.close
end