diff options
author | Patrick Simianer <p@simianer.de> | 2014-01-29 19:14:08 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-01-29 19:14:08 +0100 |
commit | 68acbb9a0c7967cb90a7e3756fc94fdd8a73d154 (patch) | |
tree | 3b445131dcb203e94473ae1d8aa82a1798585276 /traintestsplit | |
parent | 49158e721bfaf6423dca9fc633873218f691c83a (diff) |
make use of nlp_ruby, LICENSE
Diffstat (limited to 'traintestsplit')
-rwxr-xr-x | traintestsplit | 90 |
1 files changed, 43 insertions, 47 deletions
```diff
diff --git a/traintestsplit b/traintestsplit
index 7ec52ae..7cc5bcf 100755
--- a/traintestsplit
+++ b/traintestsplit
@@ -1,55 +1,51 @@
 #!/usr/bin/env ruby
 
+require 'nlp_ruby'
 require 'trollop'
 
 
-def main
-  opts = Trollop::options do
-    opt :foreign, "foreign file", :type => :string, :required => true
-    opt :english, "english file", :type => :string, :required => true
-    opt :size, "one size", :type => :int, :required => true
-    opt :repeat, "number of repetitions", :type => :int, :default => 1
-    opt :prefix, "prefix for output files", :type => :string
-  end
-  fn = opts[:foreign]
-  fn_ext = fn.split('.').last
-  f = File.new(fn, 'r').readlines
-  en = opts[:english]
-  en_ext = en.split('.').last
-  e = File.new(en, 'r').readlines
-  size = opts[:size]
-  nlines_f = `wc -l #{fn}`.split()[0].to_i
-  nlines_e = `wc -l #{en}`.split()[0].to_i
-  if nlines_f!=nlines_e
-    STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
-    exit 1
-  end
-
-  prefix = opts[:prefix]
-  a = (0..nlines_e-1).to_a
-  i = 0
-  opts[:repeat].times {
-    b = a.sample(size)
-    ax = a.reject{|j| b.include? j}
-    `mkdir split_#{i}`
-    new_f = File.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}", 'w+'
-    new_e = File.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}", 'w+'
-    ax.each { |j|
-      new_f.write f[j]
-      new_e.write e[j]
-    }
-    new_f.close; new_e.close
-    new_f = File.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}", 'w+'
-    new_e = File.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}", 'w+'
-    b.each { |j|
-      new_f.write f[j]
-      new_e.write e[j]
-    }
-    new_f.close; new_e.close
-    i += 1
-  }
+cfg = Trollop::options do
+  opt :foreign, "foreign file", :type => :string, :required => true
+  opt :english, "english file", :type => :string, :required => true
+  opt :size, "one size", :type => :int, :required => true
+  opt :repeat, "number of repetitions", :type => :int, :default => 1
+  opt :prefix, "prefix for output files", :type => :string
+end
+fn = cfg[:foreign]
+fn_ext = fn.split('.').last
+f = ReadFile.new(fn).readlines
+en = cfg[:english]
+en_ext = en.split('.').last
+e = ReadFile(en).readlines
+size = cfg[:size]
+nlines_f = `wc -l #{fn}`.split()[0].to_i
+nlines_e = `wc -l #{en}`.split()[0].to_i
+if nlines_f != nlines_e
+  STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
+  exit 1
 end
 
-
-main
+prefix = cfg[:prefix]
+a = (0..nlines_e-1).to_a
+i = 0
+cfg[:repeat].times {
+  b = a.sample(size)
+  ax = a.reject{|j| b.include? j}
+  `mkdir split_#{i}`
+  new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}")
+  new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}")
+  ax.each { |j|
+    new_f.write f[j]
+    new_e.write e[j]
+  }
+  new_f.close; new_e.close
+  new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}")
+  new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}")
+  b.each { |j|
+    new_f.write f[j]
+    new_e.write e[j]
+  }
+  new_f.close; new_e.close
+  i += 1
+}
 
```