diff options
Diffstat (limited to 'training/dtrain/parallelize.rb')
-rwxr-xr-x | training/dtrain/parallelize.rb | 35 |
1 files changed, 10 insertions, 25 deletions
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 5fc8b04e..fe3a6cf5 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -1,10 +1,11 @@ #!/usr/bin/env ruby require 'trollop' +require 'zipf' def usage STDERR.write "Usage: " - STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l mem_free=24G\"]\n" + STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"] [--extra_qsub \"-l mem_free=24G\"]\n" exit 1 end @@ -16,15 +17,14 @@ opts = Trollop::options do opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false opt :shards, "number of shards", :type => :int opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999 - opt :input, "input", :type => :string - opt :references, "references", :type => :string + opt :input, "input (bitext f ||| e ||| ...)", :type => :string opt :qsub, "use qsub", :type => :bool, :default => false opt :dtrain_binary, "path to dtrain binary", :type => :string opt :extra_qsub, "extra qsub args", :type => :string, :default => "" opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => '-o' opt :first_input_weights, "input weights for first iter", :type => :string, :default => '', :short => '-w' end -usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] +usage if not opts[:config]&&opts[:shards]&&opts[:input] dtrain_dir = File.expand_path File.dirname(__FILE__) if not opts[:dtrain_binary] @@ -51,7 +51,6 @@ else num_shards = opts[:shards] end input = opts[:input] -refs = opts[:references] use_qsub = opts[:qsub] shards_at_once = opts[:processes_at_once] first_input_weights = opts[:first_input_weights] @@ -59,7 +58,7 @@ opts[:extra_qsub] = "-l #{opts[:extra_qsub]}" if opts[:extra_qsub]!="" `mkdir work` -def make_shards(input, refs, num_shards, epoch, rand) +def make_shards(input, num_shards, epoch, rand) lc = `wc -l #{input}`.split.first.to_i index = (0..lc-1).to_a index.reverse! @@ -69,12 +68,8 @@ def make_shards(input, refs, num_shards, epoch, rand) leftover = 0 if leftover < 0 in_f = File.new input, 'r' in_lines = in_f.readlines - refs_f = File.new refs, 'r' - refs_lines = refs_f.readlines shard_in_files = [] - shard_refs_files = [] in_fns = [] - refs_fns = [] new_num_shards = 0 0.upto(num_shards-1) { |shard| break if index.size==0 @@ -82,41 +77,32 @@ def make_shards(input, refs, num_shards, epoch, rand) in_fn = "work/shard.#{shard}.#{epoch}.in" shard_in = File.new in_fn, 'w+' in_fns << in_fn - refs_fn = "work/shard.#{shard}.#{epoch}.refs" - shard_refs = File.new refs_fn, 'w+' - refs_fns << refs_fn 0.upto(shard_sz-1) { |i| j = index.pop break if !j shard_in.write in_lines[j] - shard_refs.write refs_lines[j] } shard_in_files << shard_in - shard_refs_files << shard_refs } while leftover > 0 j = index.pop shard_in_files[-1].write in_lines[j] - shard_refs_files[-1].write refs_lines[j] leftover -= 1 end - (shard_in_files + shard_refs_files).each do |f| f.close end + shard_in_files.each do |f| f.close end in_f.close - refs_f.close - return in_fns, refs_fns, new_num_shards + return in_fns, new_num_shards end input_files = [] -refs_files = [] if predefined_shards input_files = File.new(input).readlines.map {|i| i.strip } - refs_files = File.new(refs).readlines.map {|i| i.strip } if per_shard_decoder_configs decoder_configs = File.new(opts[:per_shard_decoder_configs]).readlines.map {|i| i.strip} end num_shards = input_files.size else - input_files, refs_files, num_shards = make_shards input, refs, num_shards, 0, rand + input_files, num_shards = make_shards input, num_shards, 0, rand end 0.upto(epochs-1) { |epoch| @@ -149,8 +135,7 @@ end end pids << Kernel.fork { `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg} #{input_weights}\ - --input #{input_files[shard]}\ - --refs #{refs_files[shard]}\ + --bitext #{input_files[shard]}\ --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}` } weights_files << "work/weights.#{shard}.#{epoch}" @@ -163,7 +148,7 @@ end `#{cat} work/weights.*.#{epoch} > work/weights_cat` `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}` if rand and reshard and epoch+1!=epochs - input_files, refs_files, num_shards = make_shards input, refs, num_shards, epoch+1, rand + input_files, num_shards = make_shards input, num_shards, epoch+1, rand end } |