#!/usr/bin/env ruby

require 'zipf'
require 'optimist'

# Command-line interface. --source, --target and --size are mandatory; the
# remaining options fall back to the defaults declared here.
conf = Optimist.options do
  opt :source, "source file",             type: :string, required: true
  opt :target, "target file",             type: :string, required: true
  opt :size,   "one size",                type: :int,    required: true
  opt :repeat, "number of repetitions",   type: :int,    default: 1
  opt :prefix, "prefix for output files", type: :string, default: "split"
  opt :sets,   "number of sets",          type: :int,    default: 1
end

# Input paths as given on the command line.
source_filename = conf[:source]
target_filename = conf[:target]

# Output files reuse whatever follows the last '.' of each input name
# (the whole name when it contains no '.').
source_extension = source_filename.split('.').last
target_extension = target_filename.split('.').last

# Read both sides of the corpus, source first.
# NOTE(review): ReadFile is presumably provided by the zipf gem -- confirm.
source_lines = ReadFile.readlines(source_filename)
target_lines = ReadFile.readlines(target_filename)

size = conf[:size]

# Lines are paired by position below, so both files need the same length.
unless source_lines.size == target_lines.size
  STDERR.write "Unbalanced files (#{source_lines.size} vs. #{target_lines.size}), exiting!\n"
  exit 1
end

# Produce conf[:repeat] independent random splits of the line pairs.
# Repetition i writes into its own directory "split_<i>":
#   <prefix>.devtest.<s>.<ext>  one file pair per held-out set s
#                               (conf[:sets] sets of up to `size` pairs each)
#   <prefix>.train.<ext>        every pair that was not sampled, except pairs
#                               whose source or target line also occurs
#                               (ignoring case) in one of the held-out sets
index = (0...source_lines.size).to_a
conf[:repeat].times do |i|
  out_dir = "split_#{i}"
  # Create the directory without spawning a shell; an existing one is reused.
  Dir.mkdir(out_dir) unless Dir.exist?(out_dir)

  # One draw without replacement covers all held-out sets of this repetition.
  sampled = index.sample(size * conf[:sets])

  # Lower-cased held-out lines, used to keep duplicates out of the training data.
  test_strings_source = {}
  test_strings_target = {}

  conf[:sets].times do |s|
    # Set s owns the s-th consecutive chunk of the sample, so sets never
    # overlap. The chunk is short or empty when the corpus has fewer than
    # size * sets lines; `|| []` covers a start offset beyond the sample.
    held_out = sampled[s * size, size] || []

    # NOTE(review): WriteFile is presumably provided by the zipf gem -- confirm.
    source_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.devtest.#{s}.#{source_extension}"
    target_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.devtest.#{s}.#{target_extension}"

    held_out.each do |j|
      source_file.write source_lines[j]
      target_file.write target_lines[j]
      test_strings_source[source_lines[j].downcase] = true
      test_strings_target[target_lines[j].downcase] = true
    end
    source_file.close
    target_file.close
  end

  source_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.train.#{source_extension}"
  target_file = WriteFile.new "#{out_dir}/#{conf[:prefix]}.train.#{target_extension}"
  # Array difference keeps the original line order and avoids scanning
  # `sampled` once per corpus line.
  (index - sampled).each do |j|
    # Both lookups use the lower-cased line, matching how the keys were stored.
    next if test_strings_source.key?(source_lines[j].downcase) ||
            test_strings_target.key?(target_lines[j].downcase)

    source_file.write source_lines[j]
    target_file.write target_lines[j]
  end
  source_file.close
  target_file.close
end