#!/usr/bin/env ruby
#
# rampfion.rb
#
# Provenance: recovered from a flattened git patch
#   commit 8f3b6a8889bc1b8a18f14e947360e0a8bee808b7
#   author Patrick Simianer, Wed, 8 Jan 2014 18:11:48 +0100
#   subject "too much to say" (new file rampfion.rb, mode 100755)
#
# What the script does (see +main+):
#   1. for every input sentence, asks cdec for a k-best list of translations
#      under the current weights,
#   2. scores every translation with per-sentence BLEU against a reference,
#   3. runs translations through a semantic parser and a Prolog evaluator and
#      compares the result with a gold output ("feedback"),
#   4. picks a "hope" and a "fear" translation according to the chosen variant,
#   5. moves the weights towards hope and away from fear,
#   6. writes the weights and prints statistics after every iteration.

require 'trollop'
require 'tempfile'
require 'open3'
require 'memcached'
require 'timeout'
require 'shellwords'


SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset 2>/dev/null'
EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl'
CDEC = "/toolbox/cdec-dtrain/bin/cdec"

# Parser/evaluator results are cached in memcached, keyed by the input string.
$cache = Memcached.new("localhost:11211")

# Runs +cmd+ through the shell and returns what it wrote to stdout.
# The semantic parser hangs sometimes, hence the timeout.
#
# cmd   - shell command line
# t     - timeout in seconds
# debug - echo the command before running it
#
# Returns the captured stdout, or "" if the command did not finish within +t+
# seconds.
def spawn_with_timeout(cmd, t=4, debug=false)
  puts cmd if debug
  pipe_in, pipe_out = IO.pipe
  pid = Process.spawn(cmd, :out => pipe_out)
  # The parent has to drop its copy of the write end, otherwise the read
  # below can never see EOF.
  pipe_out.close
  begin
    # Read *inside* the timeout and before waiting: a child that writes more
    # than the pipe buffer holds would otherwise block until the timeout.
    return Timeout.timeout(t) {
      data = pipe_in.read
      Process.wait pid
      data
    }
  rescue Timeout::Error
    # The child is deliberately not killed (as in the original), but it is
    # detached so that it gets reaped once it exits instead of staying a
    # zombie.
    Process.detach pid
    return ""
  ensure
    pipe_in.close unless pipe_in.closed?
  end
end

# Parses +natural_language_string+ into a meaning representation, executes it
# and compares the result with +reference_output+.
# NOTE: this shadows Kernel#exec for the whole script.
#
# natural_language_string - sentence to parse
# reference_output        - gold output to compare against
# no_output               - suppress the per-call report on stdout
#
# Returns [feedback, func, output]: feedback is true iff output equals
# reference_output, func is the parser output, output is the text after
# "X = " in the evaluator output (nil if there is none).
def exec(natural_language_string, reference_output, no_output=false)
  func = nil
  output = nil
  key_prefix = natural_language_string.encode("ASCII", :invalid => :replace, :undef => :replace, :replace => "?").gsub(/ /,'_')
  begin
    func = $cache.get key_prefix+"__FUNC"
    output = $cache.get key_prefix+"__OUTPUT"
  rescue Memcached::NotFound
    # Arguments are shell-escaped: sentences and parses may contain quotes,
    # '$' or backticks.
    func = spawn_with_timeout("#{SMT_SEMPARSE} #{Shellwords.escape(natural_language_string)}").strip
    query = "execute_funql_query(#{func}, X)."
    output = spawn_with_timeout("echo #{Shellwords.escape(query)} | swipl -s #{EVAL_PL} 2>&1 | grep \"X =\"").strip.split('X = ')[1]
    begin
      $cache.set key_prefix+"__FUNC", func
      $cache.set key_prefix+"__OUTPUT", output
      # still written so that existing cache consumers keep working
      $cache.set key_prefix+"__FEEDBACK", output==reference_output
    rescue SystemExit, Interrupt
      # do not leave a half-written entry behind ...
      ['__FUNC', '__OUTPUT', '__FEEDBACK'].each { |suffix|
        begin
          $cache.delete key_prefix+suffix
        rescue Memcached::NotFound
          # this key had not been written yet
        end
      }
      # ... but do not swallow the exit/interrupt either
      raise
    end
  end
  # Always derived from the output: a cached verdict would be wrong whenever
  # the same string is checked against a different reference.
  feedback = output==reference_output
  unless no_output
    puts " nrl: #{natural_language_string}"
    puts " mrl: #{func}"
    puts " output: #{output}"
    puts " correct?: #{feedback}"
  end
  return feedback, func, output
end

# decoder interaction/translations

# One entry of a k-best list.
#
# s     - translation string
# f     - feature vector (NamedSparseVector)
# rank  - position in the k-best list (-1 if unknown)
# model - model score
# score - per-sentence BLEU, -1.0 until scored
class Translation
  attr_accessor :s, :f, :rank, :model, :score

  # kbest_line - " ||| "-separated line; field 1 is the string, field 2 the
  #              "name=value" features, field 3 the model score
  # rank       - position of the line in the k-best list
  #
  # Raises ArgumentError if the line has fewer than three fields.
  def initialize(kbest_line, rank=-1)
    fields = kbest_line.split ' ||| '
    raise ArgumentError, "malformed k-best line: #{kbest_line}" if fields.size < 3
    @s = fields[1].strip
    features = {}
    fields[2].split.each { |pair|
      name, value = pair.split '='
      features[name] = value.to_f
    }
    @f = NamedSparseVector.new features
    @rank = rank
    @model = fields[3].to_f
    @score = -1.0
  end

  def to_s
    "#{@rank} ||| #{@s} ||| #{@model} ||| #{@score} ||| #{@f.to_s}"
  end
end

# Decodes sentence +s+ with cdec.
#
# s   - input sentence
# k   - size of the k-best list
# ini - cdec configuration file
# w   - weights file
#
# Returns an Array of Translation, ranked in output order.
def predict_translation(s, k, ini, w)
  cmd = "#{CDEC} -c #{Shellwords.escape(ini)} -r -k #{k} -w #{Shellwords.escape(w)} 2>/dev/null"
  # The sentence goes to stdin directly instead of through `echo "..."`, so
  # the shell never interprets its content.
  out, _status = Open3.capture2(cmd, :stdin_data => "#{s}\n")
  out.split("\n").each_with_index.map { |line, rank| Translation.new(line, rank) }
end

# scoring (per-sentence BLEU)

# Yields the n-grams of +s+ (whitespace tokenized) as token arrays.
#
# s   - string
# n   - maximum n-gram order
# fix - if true only n-grams of exactly order n are yielded,
#       otherwise all orders 1..n
def ngrams_it(s, n, fix=false)
  tokens = s.strip.split
  tokens.each_index { |i|
    0.upto([n-1, tokens.size-i-1].min) { |m|
      gram = tokens[i..i+m]
      yield gram if !fix || gram.size == n
    }
  }
end

# Returns 1.0 if the hypothesis is longer than the reference,
# exp(1 - |reference|/|hypothesis|) otherwise (0.0 for an empty hypothesis).
def brevity_penalty(hypothesis, reference)
  hyp_len = hypothesis.split.size
  ref_len = reference.split.size
  return 0.0 if hyp_len == 0
  return 1.0 if hyp_len > ref_len
  Math.exp(1.0 - ref_len.to_f/hyp_len)
end

# Per-sentence BLEU with add-one smoothing for orders >= 2.
#
# hypothesis - hypothesis string
# reference  - reference string
# n          - maximum n-gram order (capped by the reference length)
#
# Returns a Float; 0.0 if hypothesis or reference is empty.
def per_sentence_bleu(hypothesis, reference, n=4)
  h_ng = {}; r_ng = {}
  1.upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
  ngrams_it(hypothesis, n) { |g| h_ng[g.size] << g }
  ngrams_it(reference, n) { |g| r_ng[g.size] << g }
  m = [n, reference.split.size].min
  return 0.0 if m < 1 || h_ng[1].empty?
  weight = 1.0/m
  sum = 0.0
  1.upto(m) { |i|
    add = i >= 2 ? 1.0 : 0.0
    hyp_counts = Hash.new(0)
    ref_counts = Hash.new(0)
    h_ng[i].each { |g| hyp_counts[g] += 1 }
    r_ng[i].each { |g| ref_counts[g] += 1 }
    # clipped count: a hypothesis n-gram is credited at most as often as it
    # occurs in the hypothesis AND at most as often as in the reference
    counts_clipped = 0
    hyp_counts.each_pair { |g, c| counts_clipped += [c, ref_counts[g]].min }
    counts_sum = h_ng[i].size
    sum += weight * Math.log((counts_clipped + add)/(counts_sum + add))
  }
  brevity_penalty(hypothesis, reference) * Math.exp(sum)
end

# Sets +score+ of every translation to its per-sentence BLEU w.r.t. +reference+.
def score_translations(list_of_translations, reference)
  list_of_translations.each { |t| t.score = per_sentence_bleu t.s, reference }
end

# hope and fear

# Picks the entry maximizing model+score ('hope') or model-score ('fear').
# The first maximum wins; if no entry is better than -Infinity the last
# entry is returned (nil for an empty list).
#
# Raises ArgumentError for any other +action+.
def hope_and_fear(kbest, action)
  unless action == 'hope' || action == 'fear'
    raise ArgumentError, "action has to be 'hope' or 'fear', got '#{action}'"
  end
  sign = action == 'hope' ? 1.0 : -1.0
  max = -1.0/0
  max_idx = -1
  kbest.each_with_index { |t, idx|
    value = t.model + sign*t.score
    if value > max
      max_idx = idx
      max = value
    end
  }
  kbest[max_idx]
end

# update

# Returns w + eta*(hope.f - fear.f); +w+ itself is not modified.
def update(w, hope, fear, eta)
  w + (hope.f - fear.f)*eta
end

# weights

# Sparse vector with named dimensions, backed by a Hash whose default is 0.0.
class NamedSparseVector
  attr_accessor :h

  # init - optional Hash used as storage (its default is set to 0.0)
  def initialize(init=nil)
    @h = init || {}
    @h.default = 0.0
  end

  def +(other)
    ret = NamedSparseVector.new({}.update(@h))
    other.each_pair { |k,v| ret[k] += v }
    ret
  end

  # Reads "name value" lines from +fn+ into this vector; blank lines are
  # skipped.
  def from_file(fn)
    File.foreach(fn) { |line|
      name, value = line.strip.split
      next unless name
      @h[name] = value.to_f
    }
  end

  # Returns the vector serialized as "name value" lines. Despite its name it
  # does not write anything.
  def to_file
    @h.map { |k,v| "#{k} #{v}" }.join("\n")+"\n"
  end

  def -(other)
    ret = NamedSparseVector.new({}.update(@h))
    other.each_pair { |k,v| ret[k] -= v }
    ret
  end

  def *(scalar)
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    ret = NamedSparseVector.new
    @h.each_pair { |k,v| ret[k] = v*scalar }
    ret
  end

  def dot(other)
    sum = 0.0
    @h.each_pair { |k,v| sum += v * other[k] }
    sum
  end

  def [](k)
    @h[k]
  end

  def []=(k, v)
    @h[k] = v
  end

  def each_pair
    @h.each_pair { |k,v| yield k,v }
  end

  def to_s
    @h.to_s
  end

  # Euclidean norm; 0.0 for an empty vector.
  def length
    Math.sqrt(@h.values.inject(0.0) { |acc, v| acc + v*v })
  end

  # Scales the vector to length 1 in place; a zero vector is left untouched.
  def normalize!
    l = length
    return self if l == 0
    @h.each_pair { |k,v| @h[k] = v/l }
    self
  end

  def size
    @h.keys.size
  end
end

# Maps the model scores of +kbest+ to [0,factor] in place (min-max scaling).
# If all model scores are equal (e.g. a single entry) they are set to 0.0
# instead of dividing by zero.
def adj_model(kbest, factor)
  scores = kbest.map { |t| t.model }
  min = scores.min
  max = scores.max
  range = max - min
  kbest.each { |t|
    t.model = range == 0 ? 0.0 : factor*((t.model-min)/range)
  }
end

# Counts, for one kind of translation (top1/hope/fear/refs), how often a
# parse, an output and a correct output was obtained.
class Stats
  def initialize(name)
    @name = name
    @with_parse = 0.0
    @with_output = 0.0
    @correct_output = 0.0
  end

  # feedback - true iff the output was correct
  # func     - parser output; "None", '' and nil count as "no parse"
  # output   - evaluator output; "null", '' and nil count as "no output"
  def update(feedback, func, output)
    @with_parse += 1 if present?(func) && func != "None"
    @with_output += 1 if present?(output) && output != "null"
    @correct_output += 1 if feedback == true
  end

  # Returns the report as a String (nothing is written to stdout).
  #
  # total - number of examples the percentages refer to; "adj" relates the
  #         correct outputs to the examples that had a parse
  def print(total)
    <<-eos
 [#{@name}]
 #{@name} with parse #{pct(@with_parse, total)} abs:#{@with_parse}
 #{@name} with output #{pct(@with_output, total)} abs:#{@with_output}
#{@name} with correct output #{pct(@correct_output, total)} adj:#{pct(@correct_output, @with_parse)} abs:#{@correct_output}
    eos
  end

  private

  def present?(value)
    !value.nil? && value != ''
  end

  # part/whole as a percentage with two decimals; 0.0 if whole is zero
  def pct(part, whole)
    return 0.0 if whole == 0
    ((part/whole)*100).round 2
  end
end

def _print(rank, string, model, score)
  puts "rank=#{rank} string='#{string}' model=#{model} score=#{score}"
end

# Returns the sorted, unique tokens of +s+ that are not in +stopwords+.
def bag_of_words(s, stopwords=[])
  s.split.uniq.sort.reject { |v| stopwords.include? v }
end

# Shared by the variants that take hope from the model: the top-1 entry if
# its execution was correct (type 1), the model+BLEU maximum otherwise
# (type 2).
#
# Returns [hope, type1, type2].
def _hope_from_model(kbest, feedback)
  return kbest[0], true, false if feedback == true
  return hope_and_fear(kbest, 'hope'), false, true
end

# Walks +candidates+ in order and returns [index, candidate] of the first one
# whose execution feedback equals +wanted+; indices in +skip+ are passed over
# and the walk stops at index +max+. Returns [nil, nil] if nothing is found.
def _find_by_execution(candidates, gold, max, wanted, skip=[])
  candidates.each_with_index { |cand, idx|
    next if skip.include? idx
    break if idx == max
    return idx, cand if (exec(cand.s, gold, true)[0] ? true : false) == wanted
  }
  return nil, nil
end

# All gethopefear_* functions return [hope, fear, skip, type1, type2] (the
# only_exec ones additionally return new_reference); skip is truthy if the
# example should not be used for an update.

# hope: top-1 if correct, else model+BLEU maximum; fear: model-BLEU maximum.
def gethopefear_standard(kbest, feedback)
  hope, type1, type2 = _hope_from_model(kbest, feedback)
  fear = hope_and_fear(kbest, 'fear')
  return hope, fear, false, type1, type2
end

# Like standard, but fear is the first of the (at most +max+) entries with
# the highest model+BLEU whose execution is NOT correct.
def gethopefear_fear_no_exec(kbest, feedback, gold, max)
  hope, type1, type2 = _hope_from_model(kbest, feedback)
  by_hope = kbest.sort { |x,y| (y.model+y.score)<=>(x.model+x.score) }
  _, fear = _find_by_execution(by_hope, gold, max, false)
  return hope, fear, fear.nil?, type1, type2
end

# Like standard, but the example is skipped if the fear executes correctly.
def gethopefear_fear_no_exec_skip(kbest, feedback, gold)
  hope, type1, type2 = _hope_from_model(kbest, feedback)
  fear = hope_and_fear(kbest, 'fear')
  skip = exec(fear.s, gold, true)[0]
  return hope, fear, skip, type1, type2
end

# hope: top-1 if correct, else the first correctly executing entry in
# model+BLEU order; fear: the first entry in that order that does not execute
# correctly.
def gethopefear_fear_no_exec_hope_exec(kbest, feedback, gold, max)
  hope = fear = nil
  hope_idx = 0
  type1 = type2 = false
  sorted_kbest = kbest.sort { |x,y| (y.model+y.score)<=>(x.model+x.score) }
  if feedback == true
    hope = kbest[0]
    type1 = true
  else
    # NOTE(review): index 0 of the *sorted* list is skipped here, which is
    # not necessarily the top-1 translation -- kept as in the original.
    idx, hope = _find_by_execution(sorted_kbest, gold, max, true, [0])
    hope_idx = idx || 0
    type2 = true
  end
  sorted_kbest.each_with_index { |k,i|
    break if i>(kbest.size-(hope_idx+1))||i==max
    if !exec(k.s, gold, true)[0]
      fear = k
      break
    end
  }
  skip = !hope||!fear
  return hope, fear, skip, type1, type2
end

# Hope and fear are chosen by execution only, walking the k-best list in
# decoder order (entry 0 is never searched):
#   hope - top-1 if correct (it then also becomes new_reference), else
#          own_reference if given, else the first correctly executing entry
#   fear - the first entry (other than hope) that does not execute correctly
def gethopefear_only_exec(kbest, feedback, gold, max, own_reference=nil)
  hope = new_reference = nil
  hope_idx = 0
  type1 = type2 = false
  if feedback == true
    hope = kbest[0]
    new_reference = hope
    type1 = true
  elsif own_reference
    hope = own_reference
    type1 = true
  else
    idx, hope = _find_by_execution(kbest, gold, max, true, [0])
    hope_idx = idx || 0
    type2 = true
  end
  _, fear = _find_by_execution(kbest, gold, max, false, [0, hope_idx])
  skip = !hope||!fear
  return hope, fear, skip, type1, type2, new_reference
end

# Same behaviour as gethopefear_only_exec (the two were verbatim copies).
def gethopefear_only_exec_simple(kbest, feedback, gold, max, own_reference=nil)
  gethopefear_only_exec kbest, feedback, gold, max, own_reference
end

# If the top-1 string equals the reference: hope is top-1, fear the
# model-BLEU maximum (type 1); otherwise hope is the model+BLEU maximum and
# fear is top-1 (type 2).
def gethopefear_rampion(kbest, reference)
  if kbest[0].s == reference
    return kbest[0], hope_and_fear(kbest, 'fear'), false, true, false
  end
  return hope_and_fear(kbest, 'hope'), kbest[0], false, false, true
end

# Writes the weights +w+ to a temporary file, decodes +sentence+ with them
# and removes the file again.
#
# Returns the k-best list (Array of Translation).
def _kbest_for(sentence, w, k, ini)
  tmp_file = Tempfile.new('rampion')
  begin
    tmp_file.write w.to_file
    tmp_file.close
    predict_translation sentence, k, ini, tmp_file.path
  ensure
    tmp_file.close! # closes (if still open) and unlinks
  end
end

def main
  opts = Trollop::options do
    # data
    opt :k, "k", :type => :int, :default => 10000
    opt :hope_fear_max, "max. number of k-best entries to look at when searching hope/fear by execution", :type => :int, :default => 32, :short => '-q'
    opt :input, "'foreign' input", :type => :string, :required => true
    opt :references, "(parseable) references", :type => :string, :required => true
    opt :gold, "gold output", :type => :string, :required => true
    opt :gold_mrl, "gold parse", :type => :string, :short => '-h', :required => true
    opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w'
    opt :cdec_ini, "cdec config file", :type => :string, :default => './cdec.ini'
    # output
    opt :debug, "debug output", :type => :bool, :default => false
    opt :output_weights, "output file for final weights", :type => :string, :required => true
    opt :stop_after, "stop after x examples", :type => :int, :default => -1
    opt :print_kbests, "print full kbest lists", :type => :bool, :default => false, :short => '-l'
    # important parameters
    opt :eta, "learning rate", :type => :float, :default => 0.01
    opt :iterate, "iteration X epochs", :type => :int, :default => 1, :short => '-j'
    opt :variant, "standard, rampion, fear_no_exec, fear_no_exec_skip, fear_no_exec_hope_exec, only_exec", :default => 'standard'
    # misc parameters
    opt :scale_model, "scale model score by this factor", :type => :float, :default => 1.0, :short => '-m'
    opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-n'
    opt :skip_on_no_proper_gold, "skip if the reference didn't produce a proper gold output", :default => false, :short => '-x'
    opt :no_update, "don't update weights", :type => :bool, :default => false, :short => '-y'
  end
  # output configuration
  puts "cfg"
  opts.each_pair { |k,v| puts "#{k}=#{v}" }
  puts
  # read files
  read_lines = lambda { |fn| File.readlines(fn, :encoding=>'utf-8').map { |l| l.strip } }
  input = read_lines.call opts[:input]
  references = read_lines.call opts[:references]
  gold = read_lines.call opts[:gold]
  gold_mrl = read_lines.call opts[:gold_mrl]
  stopwords = read_lines.call 'd/stopwords.en'
  # only_exec: new refs
  own_references = nil
  own_references = references.map { |_| nil } if opts[:variant] == 'only_exec'
  # init weights
  w = NamedSparseVector.new
  w.from_file opts[:init_weights]
  # iterate
  opts[:iterate].times { |iter|
    # numerous counters
    without_translations = 0
    no_proper_gold_output = 0
    count = 0
    top1_stats = Stats.new 'top1'
    hope_stats = Stats.new 'hope'
    fear_stats = Stats.new 'fear'
    refs_stats = Stats.new 'refs'
    type1_updates = 0
    type2_updates = 0
    top1_hit = 0
    top1_variant = 0
    top1_real_variant = 0
    hope_hit = 0
    hope_variant = 0
    hope_real_variant = 0
    kbest_sz = 0
    # for each example
    input.each_with_index { |i,j|
      # stop after x examples; checked up front so that skipped examples
      # (see the `next`s below) cannot bypass the limit
      break if opts[:stop_after] > 0 && j >= opts[:stop_after]
      count += 1
      # get kbest list for current input and weights
      kbest = _kbest_for i, w, opts[:k], opts[:cdec_ini]
      kbest_sz += kbest.size
      # output
      puts "EXAMPLE #{j}"
      puts "GOLD MRL: #{gold_mrl[j]}"
      puts "GOLD OUTPUT #{gold[j]}"
      # skip if no translation could be produced
      if kbest.size == 0
        without_translations += 1
        puts "NO MT OUTPUT, skipping example\n\n"
        next
      end
      # no proper gold
      if gold[j] == '[]' || gold[j] == '[...]' || gold[j] == '[].'
        no_proper_gold_output += 1
        if opts[:skip_on_no_proper_gold]
          puts "NO PROPER GOLD OUTPUT, skipping example\n\n"
          next
        end
      end
      # score kbest list
      score_translations kbest, references[j]
      # print kbest list
      if opts[:print_kbests]
        puts "<<<KBEST"
        kbest.each { |t| puts t.to_s }
        puts ">>>"
      end
      # adjust model scores to fit in [0,1]
      adj_model kbest, opts[:scale_model]
      # top1
      puts "---top1"
      puts "TOP1 TRANSLATION: #{kbest[0].s}" if iter+1==opts[:iterate]
      _print 0, kbest[0].s, kbest[0].model, kbest[0].score
      feedback, func, output = exec kbest[0].s, gold[j]
      top1_stats.update feedback, func, output
      # reference as bag of words
      ref_words = bag_of_words references[j], stopwords
      # hope and fear
      hope = fear = new_reference = nil
      type1 = type2 = skip = false
      case opts[:variant]
      when 'standard'
        hope, fear, skip, type1, type2 = gethopefear_standard kbest, feedback
      when 'rampion'
        hope, fear, skip, type1, type2 = gethopefear_rampion kbest, references[j]
      when 'fear_no_exec_skip'
        hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_skip kbest, feedback, gold[j]
      when 'fear_no_exec'
        hope, fear, skip, type1, type2 = gethopefear_fear_no_exec kbest, feedback, gold[j], opts[:hope_fear_max]
      when 'fear_no_exec_hope_exec'
        hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_hope_exec kbest, feedback, gold[j], opts[:hope_fear_max]
      when 'only_exec'
        hope, fear, skip, type1, type2, new_reference = gethopefear_only_exec kbest, feedback, gold[j], opts[:hope_fear_max], own_references[j]
      else
        puts "no such hope/fear variant"
        exit 1
      end
      # new reference (only_exec)
      own_references[j] = new_reference if new_reference
      # type1/type2
      type1_updates += 1 if type1
      type2_updates += 1 if type2
      # top1/hope hit
      if kbest[0].s == references[j]
        top1_hit += 1
      else
        top1_variant += 1
        top1_real_variant += 1 if bag_of_words(kbest[0].s,stopwords)!=ref_words
      end
      if hope&&hope.s == references[j]
        hope_hit += 1
      elsif hope
        hope_variant += 1
        hope_real_variant += 1 if bag_of_words(hope.s,stopwords)!=ref_words
      end
      # output info for current example
      puts "---hope"
      if hope
        _print hope.rank, hope.s, hope.model, hope.score
        feedback, func, output = exec hope.s, gold[j]
        hope_stats.update feedback, func, output
      end
      puts "---fear"
      if fear
        _print fear.rank, fear.s, fear.model, fear.score
        feedback, func, output = exec fear.s, gold[j]
        fear_stats.update feedback, func, output
      end
      puts "---reference"
      _print 'x', references[j], 'x', 1.0
      feedback, func, output = exec references[j], gold[j]
      refs_stats.update feedback, func, output
      # skip example?
      if skip||!hope||!fear
        puts "NO GOOD FEAR/HOPE, skipping example\n\n"
        next
      end
      puts
      # update
      w = update w, hope, fear, opts[:eta] if !opts[:no_update]
      # normalize weight vector to length 1
      w.normalize! if opts[:normalize]
    }
    # keep weight files for each iteration; the current weights are written
    # directly so that the file includes the last update
    weights_fn = opts[:iterate] > 1 ? "#{opts[:output_weights]}.#{iter}" : opts[:output_weights]
    File.open(weights_fn, 'w') { |f| f.write w.to_file }
    # output stats
    puts "iteration ##{iter+1}/#{opts[:iterate]}"
    puts "#{count} examples"
    puts " type1 updates: #{type1_updates}"
    puts " type2 updates: #{type2_updates}"
    puts " top1 hits: #{top1_hit}"
    puts " top1 variant: #{top1_variant}"
    puts "top1 real variant: #{top1_real_variant}"
    puts " hope hits: #{hope_hit}"
    puts " hope variant: #{hope_variant}"
    puts "hope real variant: #{hope_real_variant}"
    puts " kbest size: #{(kbest_sz.to_f/count).round 2}"
    puts "#{((without_translations.to_f/count)*100).round 2}% without translations (abs: #{without_translations})"
    puts "#{((no_proper_gold_output.to_f/count)*100).round 2}% no good gold output (abs: #{no_proper_gold_output})"
    puts top1_stats.print count
    puts hope_stats.print count
    puts fear_stats.print count
    puts refs_stats.print count
  }
end


main if __FILE__ == $0