From 8f3b6a8889bc1b8a18f14e947360e0a8bee808b7 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 8 Jan 2014 18:11:48 +0100 Subject: too much to say --- rampfion.rb | 657 +++++++++++++++++++++++++++++++++++++++++++++++ rampion_with_feedback.rb | 558 ---------------------------------------- 2 files changed, 657 insertions(+), 558 deletions(-) create mode 100755 rampfion.rb delete mode 100755 rampion_with_feedback.rb diff --git a/rampfion.rb b/rampfion.rb new file mode 100755 index 0000000..ce40917 --- /dev/null +++ b/rampfion.rb @@ -0,0 +1,657 @@ +#!/usr/bin/env ruby + +require 'trollop' +require 'tempfile' +require 'open3' +require 'memcached' +require 'timeout' + + +SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset 2>/dev/null' +EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl' +CDEC = "/toolbox/cdec-dtrain/bin/cdec" + +$cache = Memcached.new("localhost:11211") + +# the semantic parser hangs sometimes +def spawn_with_timeout cmd, t=4, debug=false + puts cmd if debug + pipe_in, pipe_out = IO.pipe + pid = Process.spawn(cmd, :out => pipe_out) + begin + Timeout.timeout(t) { Process.wait pid } + rescue Timeout::Error + return "" + # accept the zombies + #Process.kill('TERM', pid) + end + pipe_out.close + return pipe_in.read +end + +# execute +def exec natural_language_string, reference_output, no_output=false + func = nil + output = nil + feedback = nil + key_prefix = natural_language_string.encode("ASCII", :invalid => :replace, :undef => :replace, :replace => "?").gsub(/ /,'_') + begin + func = $cache.get key_prefix+"__FUNC" + output = $cache.get key_prefix+"__OUTPUT" + feedback = $cache.get key_prefix+"__FEEDBACK" + rescue Memcached::NotFound + func = spawn_with_timeout("#{SMT_SEMPARSE} \"#{natural_language_string}\"").strip + output = spawn_with_timeout("echo \"execute_funql_query(#{func}, X).\" | swipl -s #{EVAL_PL} 2>&1 | grep \"X =\"").strip.split('X = ')[1] + feedback = output==reference_output + begin + $cache.set key_prefix+"__FUNC", func + $cache.set key_prefix+"__OUTPUT", output + $cache.set key_prefix+"__FEEDBACK", feedback + rescue SystemExit, Interrupt + $cache.delete key_prefix+"__FUNC" + $cache.delete key_prefix+"__OUTPUT" + $cache.delete key_prefix+"__FEEDBACK" + end + end + puts " nrl: #{natural_language_string}" if !no_output + puts " mrl: #{func}" if !no_output + puts " output: #{output}" if !no_output + puts " correct?: #{feedback}" if !no_output + return feedback, func, output +end + +# decoder interaction/translations +class Translation + attr_accessor :s, :f, :rank, :model, :score + + def initialize kbest_line, rank=-1 + a = kbest_line.split ' ||| ' + @s = a[1].strip + h = {} + a[2].split.each { |i| + name, value = i.split '=' + value = value.to_f + h[name] = value + } + @f = NamedSparseVector.new h + @rank = rank + @model = a[3].to_f + @score = -1.0 + end + + def to_s + "#{@rank} ||| #{@s} ||| #{@model} ||| #{@score} ||| #{@f.to_s}" + end +end + +def predict_translation s, k, ini, w + o, s = Open3.capture2 "echo \"#{s}\" | #{CDEC} -c #{ini} -r -k #{k} -w #{w} 2>/dev/null" + j = -1 + return o.split("\n").map{|i| j+=1; Translation.new(i, j)} +end + +# scoring (per-sentence BLEU) +def ngrams_it(s, n, fix=false) + a = s.strip.split + a.each_with_index { |tok, i| + tok.strip! + 0.upto([n-1, a.size-i-1].min) { |m| + yield a[i..i+m] if !(fix||(a[i..i+m].size>n)) + } + } +end + +def brevity_penalty hypothesis, reference + a = hypothesis.split; b = reference.split + return 1.0 if a.size>b.size + return Math.exp(1.0 - b.size.to_f/a.size); +end + +def per_sentence_bleu hypothesis, reference, n=4 + h_ng = {}; r_ng = {} + (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} + ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i} + ngrams_it(reference, n) {|i| r_ng[i.size] << i} + m = [n, reference.split.size].min + weight = 1.0/m + add = 0.0 + sum = 0 + (1).upto(m) { |i| + counts_clipped = 0 + counts_sum = h_ng[i].size + h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} + add = 1.0 if i >= 2 + sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); + } + return brevity_penalty(hypothesis, reference) * Math.exp(sum) +end + +def score_translations list_of_translations, reference + list_of_translations.each { |i| i.score = per_sentence_bleu i.s, reference} +end + +# hope and fear +def hope_and_fear kbest, action + max = -1.0/0 + max_idx = -1 + kbest.each_with_index { |i,j| + if action=='hope' && i.model + i.score > max + max_idx = j; max = i.model + i.score + end + if action=='fear' && i.model - i.score > max + max_idx = j; max = i.model - i.score + end + } + return kbest[max_idx] +end + +# update +def update w, hope, fear, eta + diff = hope.f - fear.f + diff *= eta + w += diff + return w +end + +# weights +class NamedSparseVector + attr_accessor :h + + def initialize init=nil + @h = {} + @h = init if init + @h.default = 0.0 + end + + def + other + new_h = Hash.new + new_h.update @h + ret = NamedSparseVector.new new_h + other.each_pair { |k,v| ret[k]+=v } + return ret + end + + def from_file fn + f = File.new(fn, 'r') + while line = f.gets + name, value = line.strip.split + value = value.to_f + @h[name] = value + end + end + + def to_file + s = [] + @h.each_pair { |k,v| s << "#{k} #{v}" } + s.join("\n")+"\n" + end + + def - other + new_h = Hash.new + new_h.update @h + ret = NamedSparseVector.new new_h + other.each_pair { |k,v| ret[k]-=v } + return ret + end + + def * scalar + raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric + ret = NamedSparseVector.new + @h.keys.each { |k| ret[k] = @h[k]*scalar } + return ret + end + + def dot other + sum = 0.0 + @h.each_pair { |k,v| + sum += v * other[k] + } + return sum + end + + def [] k + @h[k] + end + + def []= k, v + @h[k] = v + end + + def each_pair + @h.each_pair { |k,v| yield k,v } + end + + def to_s + @h.to_s + end + + def length + Math.sqrt(@h.values.map{|i|i*i}.inject(:+)) + end + + def normalize! + l = length + @h.each_pair { |k,v| + @h[k] = v/l + } + end + + def size + @h.keys.size + end +end + +# map models score to [0,1] +def adj_model kbest, factor + min = kbest.map{|i|i.model}.min + max = kbest.map{|i|i.model}.max + kbest.each {|i| i.model = factor*((i.model-min)/(max-min))} +end + +class Stats + def initialize name + @name = name + @with_parse = 0.0 + @with_output = 0.0 + @correct_output = 0.0 + end + + def update feedback, func, output + @with_parse +=1 if func!="None"&&func!='' + @with_output +=1 if output!="null"&&output!='' + @correct_output += 1 if feedback==true + end + + def print total + without_parse = total-@with_parse +<<-eos + [#{@name}] + #{@name} with parse #{((@with_parse/total)*100).round 2} abs:#{@with_parse} + #{@name} with output #{((@with_output/total)*100).round 2} abs:#{@with_output} +#{@name} with correct output #{((@correct_output/total)*100).round 2} adj:#{((@correct_output/(total-without_parse))*100).round 2} abs:#{@correct_output} +eos + end +end + +def _print rank, string, model, score + puts "rank=#{rank} string='#{string}' model=#{model} score=#{score}" +end + +def bag_of_words s, stopwords=[] + s.split.uniq.sort.reject{|v| stopwords.include? v} +end + +def gethopefear_standard kbest, feedback + hope = fear = nil + type1 = type2 = false + if feedback == true + hope = kbest[0] + type1 = true + else + hope = hope_and_fear(kbest, 'hope') + type2 = true + end + fear = hope_and_fear(kbest, 'fear') + return hope, fear, false, type1, type2 +end + +def gethopefear_fear_no_exec kbest, feedback, gold, max + hope = fear = nil + type1 = type2 = false + if feedback == true + hope = kbest[0] + type1 = true + else + hope = hope_and_fear(kbest, 'hope') + type2 = true + end + kbest.sort{|x,y|(y.model+y.score)<=>(x.model+x.score)}.each_with_index { |k,i| + break if i==max + if !exec(k.s, gold, true)[0] + fear = k + break + end + } + skip=true if !fear + return hope, fear, skip, type1, type2 +end + +def gethopefear_fear_no_exec_skip kbest, feedback, gold + hope = fear = nil + type1 = type2 = false + if feedback == true + hope = kbest[0] + type1 = true + else + hope = hope_and_fear(kbest, 'hope') + type2 = true + end + fear = hope_and_fear(kbest, 'fear') + skip = exec(fear.s, gold, true)[0] + return hope, fear, skip, type1, type2 +end + +def gethopefear_fear_no_exec_hope_exec kbest, feedback, gold, max + hope = fear = nil; hope_idx = 0 + type1 = type2 = false + sorted_kbest = kbest.sort{|x,y|(y.model+y.score)<=>(x.model+x.score)} + if feedback == true + hope = kbest[0] + type1 = true + else + sorted_kbest.each_with_index { |k,i| + next if i==0 + break if i==max + if exec(k.s, gold, true)[0] + hope_idx = i + hope = k + break + end + } + type2 = true + end + sorted_kbest.each_with_index { |k,i| + break if i>(kbest.size-(hope_idx+1))||i==max + if !exec(k.s, gold, true)[0] + fear = k + break + end + } + skip = true if !hope||!fear + return hope, fear, skip, type1, type2 +end + +def gethopefear_only_exec kbest, feedback, gold, max, own_reference=nil + hope = fear = nil; hope_idx = 0; new_reference = nil + type1 = type2 = false + if feedback == true + hope = kbest[0] + new_reference = hope + type1 = true + elsif own_reference + hope = own_reference + type1 = true + else + kbest.each_with_index { |k,i| + next if i==0 + break if i==max + if exec(k.s, gold, true)[0] + hope_idx = i + hope = k + break + end + } + type2 = true + end + kbest.each_with_index { |k,i| + next if i==0||i==hope_idx + break if i==max + if !exec(k.s, gold, true)[0] + fear = k + break + end + } + skip = true if !hope||!fear + return hope, fear, skip, type1, type2, new_reference +end + +def gethopefear_only_exec_simple kbest, feedback, gold, max, own_reference=nil + hope = fear = nil; hope_idx = 0; new_reference = nil + type1 = type2 = false + if feedback == true + hope = kbest[0] + new_reference = hope + type1 = true + elsif own_reference + hope = own_reference + type1 = true + else + kbest.each_with_index { |k,i| + next if i==0 + break if i==max + if exec(k.s, gold, true)[0] + hope_idx = i + hope = k + break + end + } + type2 = true + end + kbest.each_with_index { |k,i| + next if i==0||i==hope_idx + break if i==max + if !exec(k.s, gold, true)[0] + fear = k + break + end + } + skip = true if !hope||!fear + return hope, fear, skip, type1, type2, new_reference +end + +def gethopefear_rampion kbest, reference + hope = fear = nil + type1 = type2 = false + if kbest[0].s == reference + hope = kbest[0] + fear = hope_and_fear(kbest, 'fear') + type1 = true + else + hope = hope_and_fear(kbest, 'hope') + fear = kbest[0] + type2 = true + end + return hope, fear, false, type1, type2 +end + +def main + opts = Trollop::options do + # data + opt :k, "k", :type => :int, :default => 10000 + opt :hope_fear_max, "asdf", :type => :int, :default => 32, :short => '-q' + opt :input, "'foreign' input", :type => :string, :required => true + opt :references, "(parseable) references", :type => :string, :required => true + opt :gold, "gold output", :type => :string, :require => true + opt :gold_mrl, "gold parse", :type => :string, :short => '-h', :require => true + opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w' + opt :cdec_ini, "cdec config file", :type => :string, :default => './cdec.ini' + # output + opt :debug, "debug output", :type => :bool, :default => false + opt :output_weights, "output file for final weights", :type => :string, :required => true + opt :stop_after, "stop after x examples", :type => :int, :default => -1 + opt :print_kbests, "print full kbest lists", :type => :bool, :default => false, :short => '-l' + # important parameters + opt :eta, "learning rate", :type => :float, :default => 0.01 + opt :iterate, "iteration X epochs", :type => :int, :default => 1, :short => '-j' + opt :variant, "standard, rampion, fear_no_exec, fear_no_exec_skip, fear_no_exec_hope_exec, only_exec", :default => 'standard' + # misc parameters + opt :scale_model, "scale model score by this factor", :type => :float, :default => 1.0, :short => '-m' + opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-n' + opt :skip_on_no_proper_gold, "skip if the reference didn't produce a proper gold output", :default => false, :short => '-x' + opt :no_update, "don't update weights", :type => :bool, :default => false, :short => '-y' + end + # output configuration + puts "cfg" + opts.each_pair {|k,v| puts "#{k}=#{v}"} + puts + # read files + input = File.readlines(opts[:input], :encoding=>'utf-8').map{|i|i.strip} + references = File.readlines(opts[:references], :encoding=>'utf-8').map{|i|i.strip} + gold = File.readlines(opts[:gold], :encoding=>'utf-8').map{|i|i.strip} + gold_mrl = File.readlines(opts[:gold_mrl], :encoding=>'utf-8').map{|i|i.strip} + stopwords = File.readlines('d/stopwords.en', :encoding=>'utf-8').map{|i|i.strip} + # only_exec: new refs + own_references = nil + own_references = references.map{|i|nil} if opts[:variant]== 'only_exec' + # init weights + w = NamedSparseVector.new + w.from_file opts[:init_weights] + last_wf = '' +# iterate +opts[:iterate].times { |iter| + # numerous counters + without_translations = 0 + no_proper_gold_output = 0 + count = 0 + top1_stats = Stats.new 'top1' + hope_stats = Stats.new 'hope' + fear_stats = Stats.new 'fear' + refs_stats = Stats.new 'refs' + type1_updates = 0 + type2_updates = 0 + top1_hit = 0 + top1_variant = 0 + top1_real_variant = 0 + hope_hit = 0 + hope_variant = 0 + hope_real_variant = 0 + kbest_sz = 0 + # for each example + input.each_with_index { |i,j| + count += 1 + # write current weights to file + tmp_file = Tempfile.new('rampion') + tmp_file_path = tmp_file.path + last_wf = tmp_file.path + tmp_file.write w.to_file + tmp_file.close + # get kbest list for current input + kbest = predict_translation i, opts[:k], opts[:cdec_ini], tmp_file_path + kbest_sz += kbest.size + # output + puts "EXAMPLE #{j}" + puts "GOLD MRL: #{gold_mrl[j]}" + puts "GOLD OUTPUT #{gold[j]}" + # skip if no translation could be produced + if kbest.size == 0 + without_translations += 1 + puts "NO MT OUTPUT, skipping example\n\n" + next + end + # no proper gold + if gold[j] == '[]' || gold[j] == '[...]' || gold[j] == '[].' + no_proper_gold_output += 1 + if opts[:skip_on_no_proper_gold] + puts "NO PROPER GOLD OUTPUT, skipping example\n\n" + next + end + end + # score kbest list + score_translations kbest, references[j] + # print kbest list + if opts[:print_kbests] + puts "<<>>" + end + # adjust model scores to fit in [0,1] + adj_model kbest, opts[:scale_model] + # top1 + puts "---top1" + puts "TOP1 TRANSLATION: #{kbest[0].s}" if iter+1==opts[:iterate] + _print 0, kbest[0].s, kbest[0].model, kbest[0].score + feedback, func, output = exec kbest[0].s, gold[j] + top1_stats.update feedback, func, output + # reference as bag of words + ref_words = bag_of_words references[j], stopwords + # hope and fear + hope = fear = new_reference = nil + type1 = type2 = skip = false + if opts[:variant] == 'standard' + hope, fear, skip, type1, type2 = gethopefear_standard kbest, feedback + elsif opts[:variant] == 'rampion' + hope, fear, skip, type1, type2 = gethopefear_rampion kbest, references[j] + elsif opts[:variant] == 'fear_no_exec_skip' + hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_skip kbest, feedback, gold[j] + elsif opts[:variant] == 'fear_no_exec' + hope, fear, skip, type1, type2 = gethopefear_fear_no_exec kbest, feedback, gold[j], opts[:hope_fear_max] + elsif opts[:variant] == 'fear_no_exec_hope_exec' + hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_hope_exec kbest, feedback, gold[j], opts[:hope_fear_max] + elsif opts[:variant] == 'only_exec' + hope, fear, skip, type1, type2, new_reference = gethopefear_only_exec kbest, feedback, gold[j], opts[:hope_fear_max], own_references[j] + else + puts "no such hope/fear variant" + exit 1 + end + # new reference (only_exec) + if new_reference + own_references[j] = new_reference + end + # type1/type2 + type1_updates+=1 if type1 + type2_updates+=1 if type2 + # top1/hope hit + if kbest[0].s == references[j] + top1_hit += 1 + else + top1_variant += 1 + top1_real_variant += 1 if bag_of_words(kbest[0].s,stopwords)!=ref_words + end + if hope&&hope.s == references[j] + hope_hit += 1 + elsif hope + hope_variant += 1 + hope_real_variant += 1 if bag_of_words(hope.s,stopwords)!=ref_words + end + # output info for current example + puts "---hope" + if hope + _print hope.rank, hope.s, hope.model, hope.score + feedback, func, output = exec hope.s, gold[j] + hope_stats.update feedback, func, output + end + puts "---fear" + if fear + _print fear.rank, fear.s, fear.model, fear.score + feedback, func, output = exec fear.s, gold[j] + fear_stats.update feedback, func, output + end + puts "---reference" + _print 'x', references[j], 'x', 1.0 + feedback, func, output = exec references[j], gold[j] + refs_stats.update feedback, func, output + # skip example? + if skip||!hope||!fear + puts "NO GOOD FEAR/HOPE, skipping example\n\n" + next + end + puts + # update + w = update w, hope, fear, opts[:eta] if !opts[:no_update] + # normalize weight vector to length 1 + w.normalize! if opts[:normalize] + # stopx after x examples + break if opts[:stop_after]>0 && (j+1)==opts[:stop_after] + } + # keep weight files for each iteration + if opts[:iterate] > 1 + FileUtils::cp(last_wf, "#{opts[:output_weights]}.#{iter}") + else + FileUtils::cp(last_wf, opts[:output_weights]) + end + # output stats + puts "iteration ##{iter+1}/#{opts[:iterate]}" + puts "#{count} examples" + puts " type1 updates: #{type1_updates}" + puts " type2 updates: #{type2_updates}" + puts " top1 hits: #{top1_hit}" + puts " top1 variant: #{top1_variant}" + puts "top1 real variant: #{top1_real_variant}" + puts " hope hits: #{hope_hit}" + puts " hope variant: #{hope_variant}" + puts "hope real variant: #{hope_real_variant}" + puts " kbest size: #{(kbest_sz/count).round 2}" + puts "#{((without_translations.to_f/count)*100).round 2}% without translations (abs: #{without_translations})" + puts "#{((no_proper_gold_output.to_f/count)*100).round 2}% no good gold output (abs: #{no_proper_gold_output})" + puts top1_stats.print count + puts hope_stats.print count + puts fear_stats.print count + puts refs_stats.print count +} +end + + +main + diff --git a/rampion_with_feedback.rb b/rampion_with_feedback.rb deleted file mode 100755 index ca99272..0000000 --- a/rampion_with_feedback.rb +++ /dev/null @@ -1,558 +0,0 @@ -#!/usr/bin/env ruby - -require 'trollop' -require 'tempfile' -require 'open3' -require 'memcached' - - -SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset' -EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl' -CDEC = "/toolbox/cdec-dtrain/bin/cdec" - -$cache = Memcached.new("localhost:11211") - -# execute -def exec natural_language_string, reference_output, no_output=false - func = nil - output = nil - feedback = nil - key_prefix = natural_language_string.encode("ASCII", :invalid => :replace, :undef => :replace, :replace => "?").gsub(/ /,'_') - begin - func = $cache.get key_prefix+"__FUNC" - output = $cache.get key_prefix+"__OUTPUT" - feedback = $cache.get key_prefix+"__FEEDBACK" - rescue Memcached::NotFound - func = `#{SMT_SEMPARSE} "#{natural_language_string}"`.strip - output = `echo "execute_funql_query(#{func}, X)." | swipl -s #{EVAL_PL} 2>&1 | grep "X ="`.strip.split('X = ')[1].strip - feedback = output==reference_output - begin - $cache.set key_prefix+"__FUNC", func - $cache.set key_prefix+"__OUTPUT", output - $cache.set key_prefix+"__FEEDBACK", feedback - rescue SystemExit, Interrupt - $cache.delete key_prefix+"__FUNC" - $cache.delete key_prefix+"__OUTPUT" - $cache.delete key_prefix+"__FEEDBACK" - end - end - puts " nrl: #{natural_language_string}" if !no_output - puts " mrl: #{func}" if !no_output - puts " output: #{output}" if !no_output - puts " correct?: #{feedback}" if !no_output - return feedback, func, output -end - -# decoder interaction/translations -class Translation - attr_accessor :s, :f, :rank, :model, :score - - def initialize kbest_line, rank=-1 - a = kbest_line.split ' ||| ' - @s = a[1].strip - h = {} - a[2].split.each { |i| - name, value = i.split '=' - value = value.to_f - h[name] = value - } - @f = NamedSparseVector.new h - @rank = rank - @model = a[3].to_f - @score = -1.0 - end - - def to_s - "#{@rank} ||| #{@s} ||| #{@model} ||| #{@score} ||| #{@f.to_s}" - end -end - -def predict_translation s, k, ini, w - o, s = Open3.capture2 "echo \"#{s}\" | #{CDEC} -c #{ini} -r -k #{k} -w #{w} 2>/dev/null" - j = -1 - return o.split("\n").map{|i| j+=1; Translation.new(i, j)} -end - -# scoring (per-sentence BLEU) -def ngrams_it(s, n, fix=false) - a = s.strip.split - a.each_with_index { |tok, i| - tok.strip! - 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !(fix||(a[i..i+m].size>n)) - } - } -end - -def brevity_penalty hypothesis, reference - a = hypothesis.split; b = reference.split - return 1.0 if a.size>b.size - return Math.exp(1.0 - b.size.to_f/a.size); -end - -def per_sentence_bleu hypothesis, reference, n=4 - h_ng = {}; r_ng = {} - (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} - ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i} - ngrams_it(reference, n) {|i| r_ng[i.size] << i} - m = [n, reference.split.size].min - weight = 1.0/m - add = 0.0 - sum = 0 - (1).upto(m) { |i| - counts_clipped = 0 - counts_sum = h_ng[i].size - h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} - add = 1.0 if i >= 2 - sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); - } - return brevity_penalty(hypothesis, reference) * Math.exp(sum) -end - -def score_translations list_of_translations, reference - list_of_translations.each { |i| i.score = per_sentence_bleu i.s, reference} -end - -# hope and fear -def hope_and_fear kbest, action - max = -1.0/0 - max_idx = -1 - kbest.each_with_index { |i,j| - if action=='hope' && i.model + i.score > max - max_idx = j; max = i.model + i.score - end - if action=='fear' && i.model - i.score > max - max_idx = j; max = i.model - i.score - end - } - return kbest[max_idx] -end - -# update -def update w, hope, fear, eta - diff = hope.f - fear.f - diff *= eta - w += diff - return w -end - -# weights -class NamedSparseVector - attr_accessor :h - - def initialize init=nil - @h = {} - @h = init if init - @h.default = 0.0 - end - - def + other - new_h = Hash.new - new_h.update @h - ret = NamedSparseVector.new new_h - other.each_pair { |k,v| ret[k]+=v } - return ret - end - - def from_file fn - f = File.new(fn, 'r') - while line = f.gets - name, value = line.strip.split - value = value.to_f - @h[name] = value - end - end - - def to_file - s = [] - @h.each_pair { |k,v| s << "#{k} #{v}" } - s.join("\n")+"\n" - end - - def - other - new_h = Hash.new - new_h.update @h - ret = NamedSparseVector.new new_h - other.each_pair { |k,v| ret[k]-=v } - return ret - end - - def * scalar - raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric - ret = NamedSparseVector.new - @h.keys.each { |k| ret[k] = @h[k]*scalar } - return ret - end - - def dot other - sum = 0.0 - @h.each_pair { |k,v| - sum += v * other[k] - } - return sum - end - - def [] k - @h[k] - end - - def []= k, v - @h[k] = v - end - - def each_pair - @h.each_pair { |k,v| yield k,v } - end - - def to_s - @h.to_s - end - - def length - Math.sqrt(@h.values.map{|i|i*i}.inject(:+)) - end - - def normalize! - l = length - @h.each_pair { |k,v| - @h[k] = v/l - } - end - - def size - @h.keys.size - end -end - -# map models score to [0,1] -def adj_model kbest, factor - min = kbest.map{|i|i.model}.min - max = kbest.map{|i|i.model}.max - kbest.each {|i| i.model = factor*((i.model-min)/(max-min))} -end - -class Stats - def initialize name - @name = name - @with_parse = 0.0 - @with_output = 0.0 - @correct_output = 0.0 - end - - def update feedback, func, output - @with_parse +=1 if func!="None"&&func!='' - @with_output +=1 if output!="null"&&output!='' - @correct_output += 1 if feedback==true - end - - def print total - without_parse = total-@with_parse -<<-eos - [#{@name}] - #{@name} with parse #{((@with_parse/total)*100).round 2} adj:#{((@with_parse/(total-without_parse))*100).round 2} abs:#{@with_parse} - #{@name} with output #{((@with_output/total)*100).round 2} adj:#{((@with_output/(total-without_parse))*100).round 2} abs:#{@with_output} -#{@name} with correct output #{((@correct_output/total)*100).round 2} adj:#{((@correct_output/(total-without_parse))*100).round 2} abs:#{@correct_output} -eos - end -end - -def _print rank, string, model, score - puts "rank=#{rank} string='#{string}' model=#{model} score=#{score}" -end - -def bag_of_words s, stopwords=[] - s.split.uniq.sort.reject{|v| stopwords.include? v} -end - -def get_hope_fear_standard kbest, feedback - hope = nil; fear = nil - if feedback == true - hope = kbest[0] - else - hope = hope_and_fear(kbest, 'hope') - end - fear = hope_and_fear(kbest, 'fear') - return hope, fear -end - -def get_hope_fear_standard kbest, feedback - hope = nil; fear = nil - if feedback == true - hope = kbest[0] - else - hope = hope_and_fear(kbest, 'hope') - end - fear = hope_and_fear(kbest, 'fear') - return hope, fear -end - -def main - opts = Trollop::options do - # data - opt :k, "k", :type => :int, :required => true - opt :input, "'foreign' input", :type => :string, :required => true - opt :references, "(parseable) references", :type => :string, :required => true - opt :gold, "gold output", :type => :string, :require => true - opt :gold_mrl, "gold parse", :type => :string, :short => '-h', :require => true - opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w' - opt :cdec_ini, "cdec config file", :type => :string, :default => './cdec.ini' - # output - opt :debug, "debug output", :type => :bool, :default => false - opt :no_update, "don't update weights", :type => :bool, :default => false - opt :output_weights, "output file for final weights", :type => :string, :required => true - opt :stop_after, "stop after x examples", :type => :int, :default => -1 - opt :print_kbests, "print full kbest lists", :type => :bool, :default => false, :short => '-j' - # misc parameters - opt :eta, "learning rate", :type => :float, :default => 0.01 - opt :scale_model, "scale model score by this factor", :type => :float, :default => 1.0, :short => '-m' - opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-l' - # learning parameters - opt :iterate, "iteration X epochs", :type => :int, :default => 1, :short => '-u' - opt :real, "'real' rampion updates", :type => :bool, :default => false, :short => '-q' - opt :only_exec, "update only when top1 executes!", :default => false, :short => '-d' - opt :hope2, "select hope from the first X items in kbest that executes", :type => :int, :default => 0, :short => '-x' - opt :hope3, "skip example if hope doesn't execute", :type => :bool, :default => false, :short => '-b' - opt :variant, "use top1 as fear if it does not execute", :type => :bool, :default => false - opt :fear2, "skip example if fear executes", :type => :bool, :default => false - opt :skip_on_no_proper_gold, "skip if the reference didn't produce a proper gold output", :default => false, :short => '-n' - end - - # output configuration - puts "cfg" - opts.each_pair {|k,v| puts "#{k}=#{v}"} - puts - - # read files - input = File.readlines(opts[:input], :encoding=>'utf-8').map{|i|i.strip} - references = File.readlines(opts[:references], :encoding=>'utf-8').map{|i|[i.strip, nil]} - references_own = references.map{|i|false} - gold = File.readlines(opts[:gold], :encoding=>'utf-8').map{|i|i.strip} - gold_mrl = File.readlines(opts[:gold_mrl], :encoding=>'utf-8').map{|i|i.strip} - stopwords = File.readlines('d/stopwords.en', :encoding=>'utf-8').map{|i|i.strip} - - # init weights - w = NamedSparseVector.new - w.from_file opts[:init_weights] - last_wf = '' - -# iterate -opts[:iterate].times { |iter| - # numerous counters - without_translations = 0 - no_proper_gold_output = 0 - count = 0 - top1_stats = Stats.new 'top1' - hope_stats = Stats.new 'hope' - fear_stats = Stats.new 'fear' - refs_stats = Stats.new 'refs' - type1_updates = 0 - type2_updates = 0 - top1_hit = 0 - top1_variant = 0 - top1_real_variant = 0 - hope_hit = 0 - hope_variant = 0 - hope_real_variant = 0 - kbest_sz = 0 - - # for each example - input.each_with_index { |i,j| - count += 1 - # write current weights to file - tmp_file = Tempfile.new('rampion') - tmp_file_path = tmp_file.path - last_wf = tmp_file.path - tmp_file.write w.to_file - tmp_file.close - # get kbest list for current input - kbest = predict_translation i, opts[:k], opts[:cdec_ini], tmp_file_path - kbest_sz += kbest.size - # output - puts "EXAMPLE #{j}" - puts "GOLD MRL: #{gold_mrl[j]}" - puts "GOLD OUTPUT #{gold[j]}" - # skip if no translation could be produced - if kbest.size == 0 - without_translations += 1 - puts "NO MT OUTPUT, skipping example\n\n" - next - end - # no - if gold[j] == '[]' || gold[j] == '[...]' - no_proper_gold_output += 1 - if opts[:skip_on_no_proper_gold] - puts "NO PROPER GOLD OUTPUT, skipping example\n\n" - next - end - end - # score kbest list - score_translations kbest, references[j][0] - # print kbest list - if opts[:print_kbests] - puts "<<>>" - end - # adjust model scores to fit in [0,1] - adj_model kbest, opts[:scale_model] - # top1 - puts "---top1" - puts "TOP1 TRANSLATION: #{kbest[0].s}" if iter+1==opts[:iterate] - _print 0, kbest[0].s, kbest[0].model, kbest[0].score - feedback, func, output = exec kbest[0].s, gold[j] - top1_stats.update feedback, func, output - # reference as bag of words - ref_words = bag_of_words references[j][0], stopwords - - - - - # hope2 - hope_idx = nil - if opts[:hope2] > 0 - (1).upto([opts[:hope2]-1, kbest.size-1].min) { |l| - f = exec kbest[l].s, gold[j], true - if f[0] - hope_idx = l - next - end - } - end - hope = nil; fear = nil - if opts[:real] - if kbest[0].s != references[j][0] - hope = hope_and_fear(kbest, 'hope') - fear = kbest[0] - else - hope = kbest[0] - fear = hope_and_fear(kbest, 'fear') - end - elsif feedback==true - type1_updates += 1 - if kbest[0].s == references[j][0] - top1_hit +=1 - else - top1_variant += 1 - if bag_of_words(kbest[0].s,stopwords) != ref_words - top1_real_variant += 1 - if opts[:debug] - puts "<<>>" - end - end - end - if opts[:only_exec] - references[j] = [kbest[0].s, kbest[0]] - references_own[j] = true - end - hope = kbest[0] - elsif opts[:only_exec] - if references_own[j] - hope = references[j][1] - else - puts "CANNOT FIND HOPE BC NO TOP1 DOESN'T EXEC, skipping example\n\n" - next - end - else - type2_updates += 1 - if opts[:variant] - fear = kbest[0] - end - if opts[:hope2] > 0 - if hope_idx - hope = kbest[hope_idx] - else - puts "NO GOOD HOPE, skipping example\n\n" - next - end - else - hope = hope_and_fear kbest, 'hope' - if opts[:hope3] - f = exec hope.s, gold[j], true - if !f[0] - puts "HOPE NO +FEEDBACK, skipping example\n\n" - next - end - end - end - if hope.s == references[j][0] - hope_hit += 1 - else - hope_variant += 1 - hope_real_variant += 1 if bag_of_words(hope.s,stopwords)!=ref_words - end - end - fear = hope_and_fear(kbest, 'fear') if !fear - if opts[:fear2] - f = exec fear.s, gold[j], true - f = f[0] - if f - puts "FEAR EXECUTED, skipping example\n\n" - next - end - end - - - - - # output info for current example - puts "---hope" - _print hope.rank, hope.s, hope.model, hope.score - feedback, func, output = exec hope.s, gold[j] - hope_stats.update feedback, func, output - puts "---fear" - _print fear.rank, fear.s, fear.model, fear.score - feedback, func, output = exec fear.s, gold[j] - fear_stats.update feedback, func, output - puts "---reference" - _print 'x', references[j][0], 'x', 1.0 - feedback, func, output = exec references[j][0], gold[j] - refs_stats.update feedback, func, output - puts - - # update - w = update w, hope, fear, opts[:eta] if !opts[:no_update] - - # normalize weight vector to length 1 - w.normalize! if opts[:normalize] - - # stopx after x examples - break if opts[:stop_after]>0 && (j+1)==opts[:stop_after] - } - - # keep weight files for each iteration - if opts[:iterate] > 1 - FileUtils::cp(last_wf, "#{opts[:output_weights]}.#{iter}") - else - FileUtils::cp(last_wf, opts[:output_weights]) - end - - # output stats - puts "iteration ##{iter}/#{opts[:iterate]}" - puts "#{count} examples" - puts " type1 updates: #{type1_updates}" - puts " type2 updates: #{type2_updates}" - puts " top1 hits: #{top1_hit}" - puts " top1 variant: #{top1_variant}" - puts "top1 real variant: #{top1_real_variant}" - puts " hope hits: #{hope_hit}" - puts " hope variant: #{hope_variant}" - puts "hope real variant: #{hope_real_variant}" - puts " kbest size: #{(kbest_sz/count).round 2}" - puts "#{((without_translations.to_f/count)*100).round 2}% without translations (abs: #{without_translations})" - puts "#{((no_proper_gold_output.to_f/count)*100).round 2}% no good gold output (abs: #{no_proper_gold_output})" - puts top1_stats.print count - puts hope_stats.print count - puts fear_stats.print count - puts refs_stats.print count -} -end - - -main - -- cgit v1.2.3