summaryrefslogtreecommitdiff
path: root/traintestsplit
diff options
context:
space:
mode:
Diffstat (limited to 'traintestsplit')
-rwxr-xr-xtraintestsplit51
1 files changed, 0 insertions, 51 deletions
diff --git a/traintestsplit b/traintestsplit
deleted file mode 100755
index ec88df1..0000000
--- a/traintestsplit
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-require 'trollop'
-
-
-cfg = Trollop::options do
- opt :foreign, "foreign file", :type => :string, :required => true
- opt :english, "english file", :type => :string, :required => true
- opt :size, "one size", :type => :int, :required => true
- opt :repeat, "number of repetitions", :type => :int, :default => 1
- opt :prefix, "prefix for output files", :type => :string
-end
-fn = cfg[:foreign]
-fn_ext = fn.split('.').last
-f = ReadFile.readlines fn
-en = cfg[:english]
-en_ext = en.split('.').last
-e = ReadFile.readlines en
-size = cfg[:size]
-nlines_f = `wc -l #{fn}`.split()[0].to_i
-nlines_e = `wc -l #{en}`.split()[0].to_i
-if nlines_f != nlines_e
- STDERR.write "Unbalanced files (#{nlines_f} vs. #{nlines_e}), exiting!\n"
- exit 1
-end
-
-prefix = cfg[:prefix]
-a = (0..nlines_e-1).to_a
-i = 0
-cfg[:repeat].times {
- b = a.sample(size)
- ax = a.reject{|j| b.include? j}
- `mkdir split_#{i}`
- new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
- ax.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
- new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
- b.each { |j|
- new_f.write f[j]
- new_e.write e[j]
- }
- new_f.close; new_e.close
- i += 1
-}
-