diff options
| author | Patrick Simianer <p@simianer.de> | 2013-12-11 20:29:42 +0100 | 
|---|---|---|
| committer | Patrick Simianer <p@simianer.de> | 2013-12-11 20:29:42 +0100 | 
| commit | 820c06de59307ad06557a603ffee08679fcf3e43 (patch) | |
| tree | 40e95e04d66c4dd184aca7553960e12297d0ea4a /traintestsplit | |
| parent | 1dcf5842ce8a684a4943d3a67a8eb1c210359851 (diff) | |
train test split script
Diffstat (limited to 'traintestsplit')
| -rwxr-xr-x | traintestsplit | 55 | 
1 files changed, 55 insertions, 0 deletions
#!/usr/bin/env ruby
#
# traintestsplit: randomly split two line-aligned files (e.g. the two sides
# of a parallel corpus) into train and test portions, optionally several
# times. Repetition i is written to the directory split_<i>/ as
#   <prefix>.train.<i>.<ext>  and  <prefix>.test.<i>.<ext>
# for each of the two inputs, where <ext> is whatever follows the last '.'
# in the respective input file name.

require 'trollop'


# Writes the lines of +lines+ selected by +indices+ to +path+, in the order
# given by +indices+. The file is closed even if a write fails.
def write_subset(path, lines, indices)
  File.open(path, 'w') do |out|
    # puts only appends a newline when the line does not already end in
    # one, so an unterminated final input line cannot run into the line
    # written after it.
    indices.each { |j| out.puts lines[j] }
  end
end

# Entry point: parses the command line, checks that both inputs have the
# same number of lines, and writes --repeat independent random splits with
# --size test lines each (all remaining lines go to train, in input order).
# Exits with status 1 if the inputs differ in length or --size is negative.
def main
  opts = Trollop::options do
    opt :foreign, "foreign file", :type => :string, :required => true
    opt :english, "english file", :type => :string, :required => true
    opt :size, "one size", :type => :int, :required => true
    opt :repeat, "number of repetitions", :type => :int, :default => 1
    opt :prefix, "prefix for output files", :type => :string
  end

  fn = opts[:foreign]
  en = opts[:english]
  fn_ext = fn.split('.').last
  en_ext = en.split('.').last

  # File.readlines opens, reads and closes in one step (no leaked handles).
  f = File.readlines(fn)
  e = File.readlines(en)

  # Compare the lines actually read instead of shelling out to `wc -l`:
  # no shell quoting issues with file names, and a final line without a
  # trailing newline is counted too.
  if f.size != e.size
    $stderr.puts "Unbalanced files (#{f.size} vs. #{e.size}), exiting!"
    exit 1
  end

  size = opts[:size]
  if size < 0
    # Array#sample would raise ArgumentError on a negative count.
    $stderr.puts "Negative size (#{size}), exiting!"
    exit 1
  end

  # Without --prefix this is nil, which interpolates as an empty string,
  # so output names then start with '.train' / '.test'.
  prefix = opts[:prefix]
  all = (0...f.size).to_a

  opts[:repeat].times do |i|
    test = all.sample(size)
    # Array#- keeps the receiver's order and avoids a linear scan of the
    # test set for every single index.
    train = all - test

    dir = "split_#{i}"
    Dir.mkdir(dir) unless File.directory?(dir)

    write_subset("#{dir}/#{prefix}.train.#{i}.#{fn_ext}", f, train)
    write_subset("#{dir}/#{prefix}.train.#{i}.#{en_ext}", e, train)
    write_subset("#{dir}/#{prefix}.test.#{i}.#{fn_ext}", f, test)
    write_subset("#{dir}/#{prefix}.test.#{i}.#{en_ext}", e, test)
  end
end


main
