diff options
-rwxr-xr-x | rampion_with_feedback.rb | 334 |
1 files changed, 215 insertions, 119 deletions
diff --git a/rampion_with_feedback.rb b/rampion_with_feedback.rb index 70bf463..a69a922 100755 --- a/rampion_with_feedback.rb +++ b/rampion_with_feedback.rb @@ -5,20 +5,20 @@ require 'tempfile' require 'open3' -# execute -SMT_SEMPARSE = '/workspace/grounded/mosesdecoder/moses-chart-cmd/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/moses_chart -f /workspace/grounded/smt-semparse/latest/model/moses.ini 2>/dev/null' +SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset' EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl' -def exec natural_language_string, reference_output - flat_mrl = `echo "#{natural_language_string}" | ./stem.py | #{SMT_SEMPARSE}`.strip - func = `echo "#{flat_mrl}" | ./functionalize.py 2>/dev/null`.strip - res = `echo "execute_funql_query(#{func}, X)." | swipl -s #{EVAL_PL} 2>&1 | grep "X ="`.strip.split('X = ')[1] - puts " nrl: #{natural_language_string}" - puts "flat mrl: #{flat_mrl}" - puts " func: #{func}" - puts " output: #{res}" - return res==reference_output, func, res -end +CDEC = "/toolbox/cdec-dtrain/bin/cdec" +# execute +def exec natural_language_string, reference_output, no_output=false + func = `#{SMT_SEMPARSE} "#{natural_language_string}"`.strip + output = `echo "execute_funql_query(#{func}, X)." | swipl -s #{EVAL_PL} 2>&1 | grep "X ="`.strip.split('X = ')[1] + puts " nrl: #{natural_language_string}" if !no_output + puts " mrl: #{func}" if !no_output + puts " output: #{output}" if !no_output + puts " correct?: #{output==reference_output}" if !no_output + return output==reference_output, func, output +end # decoder interaction/translations class Translation @@ -44,15 +44,12 @@ class Translation end end -CDEC = "/toolbox/cdec-dtrain/bin/cdec -r" def predict_translation s, k, ini, w - cmd = " echo \"#{s}\" | #{CDEC} -c #{ini} -k #{k} -w #{w} 2>/dev/null" - o, s = Open3.capture2(cmd) + o, s = Open3.capture2 "echo \"#{s}\" | #{CDEC} -c #{ini} -r -k #{k} -w #{w} 2>/dev/null" j = -1 return o.split("\n").map{|i| j+=1; Translation.new(i, j)} end - # scoring (per-sentence BLEU) def ngrams_it(s, n, fix=false) a = s.strip.split @@ -64,70 +61,59 @@ def ngrams_it(s, n, fix=false) } end -def brevity_penalty h, r - a = h.split - b = r.split +def brevity_penalty hypothesis, reference + a = hypothesis.split; b = reference.split return 1.0 if a.size>b.size return Math.exp(1.0 - b.size.to_f/a.size); end -def per_sentence_bleu h, r, n=4 - h_ng = {} - r_ng = {} - (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] } - ngrams_it(h, n) { |i| h_ng[i.size] << i } - ngrams_it(r, n) { |i| r_ng[i.size] << i } - m = [n,r.split.size].min +def per_sentence_bleu hypothesis, reference, n=4 + h_ng = {}; r_ng = {} + (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} + ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i} + ngrams_it(reference, n) {|i| r_ng[i.size] << i} + m = [n, reference.split.size].min weight = 1.0/m add = 0.0 sum = 0 (1).upto(m) { |i| counts_clipped = 0 counts_sum = h_ng[i].size - h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } + h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} add = 1.0 if i >= 2 sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); } - return brevity_penalty(h,r) * Math.exp(sum) + return brevity_penalty(hypothesis, reference) * Math.exp(sum) end -def score_translations a, reference - a.each_with_index { |i,j| - i.score = per_sentence_bleu i.s, reference - } +def score_translations list_of_translations, reference + list_of_translations.each { |i| i.score = per_sentence_bleu i.s, reference} end -### /scoring - - -### hope and fear -def hope_and_fear a, act='hope' +# hope and fear +def hope_and_fear kbest, action max = -1.0/0 max_idx = -1 - a.each_with_index { |i,j| - if act=='hope' && i.model + i.score > max + kbest.each_with_index { |i,j| + if action=='hope' && i.model + i.score > max max_idx = j; max = i.model + i.score end - if act=='fear' && i.model - i.score > max + if action=='fear' && i.model - i.score > max max_idx = j; max = i.model - i.score end } - return a[max_idx] + return kbest[max_idx] end -### /hope and fear - - -### update -def update w, hope, fear - w = w + (hope.f - fear.f) +# update +def update w, hope, fear, eta + diff = hope.f - fear.f + diff *= eta + w += diff return w end -### /update - - -### weights +# weights class NamedSparseVector attr_accessor :h @@ -199,41 +185,59 @@ class NamedSparseVector @h.to_s end + def length + Math.sqrt(@h.values.map{|i|i*i}.inject(:+)) + end + + def normalize! + l = length + @h.each_pair { |k,v| + @h[k] = v/l + } + end + def size @h.keys.size end end -### /weights +# map models score to [0,1] +def adj_model kbest, factor + min = kbest.map{|i|i.model}.min + max = kbest.map{|i|i.model}.max + kbest.each {|i| i.model = factor*((i.model-min)/(max-min))} +end -def test opts - w = NamedSparseVector.new - w.from_file opts[:init_weights] - input = File.new(opts[:input], 'r').readlines.map{|i|i.strip} - references = File.new(opts[:references], 'r').readlines.map{|i|i.strip} - f = File.new('weights.tmp', 'w+') - f.write w.to_file - f.close - kbest = predict_translation input[0], opts[:k], 'weights.tmp' - score_translations kbest, references[0] - kbest.each_with_index { |i,j| - puts "#{i.rank} #{i.s} #{i.model} #{i.score}" - } - puts - puts "hope" - hope = hope_and_fear kbest, 'hope' - puts "#{hope.rank} #{hope.s} #{hope.model} #{hope.score}" - puts "fear" - fear = hope_and_fear kbest, 'fear' - puts "#{fear.rank} #{fear.s} #{fear.model} #{fear.score}" +class Stats + def initialize name + @name = name + @with_parse = 0.0 + @with_output = 0.0 + @correct_output = 0.0 + end + + def update feedback, func, output + @with_parse +=1 if func!="None" + @with_output +=1 if output!="null" + @correct_output += 1 if feedback==true + end + + def print total +<<-eos + [#{@name}] + with parse #{((@with_parse/total)*100).round 2} abs:#{@with_parse} + with output #{((@with_output/total)*100).round 2} abs:#{@with_output} +with correct output #{((@correct_output/total)*100).round 2} abs:#{@correct_output} +eos + end end -def adj_model a - min = a.map{|i|i.model}.min - max = a.map{|i|i.model}.max - a.each { |i| - i.model = (i.model-min)/(max-min) - } +def _print rank, string, model, score + puts "rank=#{rank} string='#{string}' model=#{model} score=#{score}" +end + +def bag_of_words s, stopwords=[] + s.split.uniq.sort.reject{|v| stopwords.include? v} end def main @@ -241,83 +245,175 @@ def main opt :k, "k", :type => :int, :required => true opt :input, "'foreign' input", :type => :string, :required => true opt :references, "(parseable) references", :type => :string, :required => true - opt :gold, "gold standard parser output", :type => :string, :require => true - opt :gold_mrl, "gold standard mrl", :type => :string, :short => '-h', :require => true + opt :gold, "gold output", :type => :string, :require => true + opt :gold_mrl, "gold parse", :type => :string, :short => '-h', :require => true opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w' opt :cdec_ini, "cdec config file", :type => :string, :default => './cdec.ini' + opt :eta, "learning rate", :type => :float, :default => 0.01 + opt :no_update, "don't update weights", :type => :bool, :default => false + opt :stop_after, "stop after x examples", :type => :int, :default => -1 + opt :output_weights, "output file for final weights", :type => :string, :required => true + opt :scale_model, "scale model score by this factor", :type => :float, :default => 1.0, :short => '-m' + opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-l' + opt :print_kbests, "print full kbest lists", :type => :bool, :default => false, :short => '-j' + opt :hope2, "select hope from the first X items in kbest that executes", :type => :int, :default => 0, :short => '-x' + opt :fear2, "skip example if fear executes", :type => :bool, :default => false end - input = File.new(opts[:input], 'r').readlines.map{|i|i.strip} + puts "cfg" + opts.each_pair {|k,v| puts "#{k}\t#{v}"} + puts + + input = File.new(opts[:input], 'r').readlines.map{|i|i.strip} references = File.new(opts[:references], 'r').readlines.map{|i|i.strip} - gold = File.new(opts[:gold], 'r').readlines.map{|i|i.strip} - gold_mrl = File.new(opts[:gold_mrl], 'r').readlines.map{|i|i.strip} + gold = File.new(opts[:gold], 'r').readlines.map{|i|i.strip} + gold_mrl = File.new(opts[:gold_mrl], 'r').readlines.map{|i|i.strip} + + stopwords = File.new('stopwords.en', 'r').readlines.map{|i|i.strip} # init weights w = NamedSparseVector.new w.from_file opts[:init_weights] - - positive_feedback = 0 without_translations = 0 - with_proper_parse = 0 - with_output = 0 - count = 0 + count = 0 + top1_stats = Stats.new 'top1' + hope_stats = Stats.new 'hope' + fear_stats = Stats.new 'fear' + refs_stats = Stats.new 'refs' + type1_updates = 0 + type2_updates = 0 + top1_hit = 0 + top1_variant = 0 + top1_real_variant = 0 + hope_hit = 0 + hope_variant = 0 + hope_real_variant = 0 + kbest_sz = 0 + last_wf = '' input.each_with_index { |i,j| count += 1 # write current weights to file tmp_file = Tempfile.new('rampion') tmp_file_path = tmp_file.path + last_wf = tmp_file.path tmp_file.write w.to_file tmp_file.close # get kbest list for current input kbest = predict_translation i, opts[:k], opts[:cdec_ini], tmp_file_path - if kbest.size==0 # FIXME: shouldnt happen + kbest_sz += kbest.size + if kbest.size==0 without_translations += 1 next end score_translations kbest, references[j] - adj_model kbest + if opts[:print_kbests] + puts "KBEST" + kbest.each_with_index { |k,l| + _print l, k.s, k.model, k.score + } + end + adj_model kbest, opts[:scale_model] # get feedback - - puts "----top1" - puts "0 #{kbest[0].s} #{kbest[0].model} #{kbest[0].score}" + puts "EXAMPLE #{j}" + puts "GOLD MRL: #{gold_mrl[j]}" + puts "GOLD OUTPUT #{gold[j]}" + # fear + fear = hope_and_fear kbest, 'fear' + if opts[:fear2] + f, g, o = exec fear.s, gold[j], true + if f + puts "FEAR EXECUTED, skipping example\n\n" + next + end + end + # top1 + puts "---top1" + _print 0, kbest[0].s, kbest[0].model, kbest[0].score feedback, func, output = exec kbest[0].s, gold[j] - with_proper_parse +=1 if func!="None" - with_output +=1 if output!="null" - positive_feedback += 1 if feedback==true - hope = ''; fear = '' + # hope2 + parses = [] + if opts[:hope2]>0 + already_seen = {} + puts "<<KBEST EXEC" + (1).upto([opts[:hope2]-1, kbest.size-1].min) { |l| + f, g, o = exec kbest[l].s, gold[j], true + words = bag_of_words kbest[l].s, stopwords + parses << f + puts "#{f} | #{l} | #{kbest[l].s} #{words.to_s}" if !already_seen.has_key? words + already_seen[words] = true + } + puts ">>>" + end + top1_stats.update feedback, func, output + # hope & update + ref_words = bag_of_words references[j], stopwords + hope = nil if feedback==true - puts "'#{kbest[0].s}'" - references[j] = kbest[0].s + if kbest[0].s == references[j] + top1_hit +=1 + else + top1_variant += 1 + top1_real_variant += 1 if bag_of_words(kbest[0].s,stopwords)!=ref_words + end + #references[j] = kbest[0].s hope = kbest[0] + type1_updates += 1 else - hope = hope_and_fear kbest, 'hope' + if opts[:hope2]>0 + c=-1; found = parses.detect{|b| c+=1; b } + hope = kbest[c] if found + if !found + puts "NO GOOD HOPE, skipping example\n\n" + next + end + else + hope = hope_and_fear kbest, 'hope' + end + if hope.s == references[j] + hope_hit += 1 + else + hope_variant += 1 + hope_real_variant += 1 if bag_of_words(hope.s,stopwords)!=ref_words + end + type2_updates += 1 end - fear = hope_and_fear kbest, 'fear' - - puts "----hope" - puts "#{hope.rank} #{hope.s} #{hope.model} #{hope.score}" - exec hope.s, gold[j] - - puts "----fear" - puts "#{fear.rank} #{fear.s} #{fear.model} #{fear.score}" - exec fear.s, gold[j] - - puts "----reference" - puts "// #{references[j]} // 1.0" - exec references[j], gold[j] - puts "GOLD MRL: #{gold_mrl[j]}" - puts "GOLD OUTPUT #{gold[j]}" + + puts "---hope" + _print hope.rank, hope.s, hope.model, hope.score + feedback, func, output = exec hope.s, gold[j] + hope_stats.update feedback, func, output + puts "---fear" + _print fear.rank, fear.s, fear.model, fear.score + feedback, func, output = exec fear.s, gold[j] + fear_stats.update feedback, func, output + puts "---reference" + _print 'x', references[j], 'x', 1.0 + feedback, func, output = exec references[j], gold[j] + refs_stats.update feedback, func, output puts - w = update w, hope, fear + w = update w, hope, fear, opts[:eta] if !opts[:no_update] + w.normalize! if opts[:normalize] + break if opts[:stop_after]>0 && (j+1)==opts[:stop_after] } + FileUtils::cp(last_wf, opts[:output_weights]) puts "#{count} examples" - puts "#{((positive_feedback.to_f/count)*100).round 2}% with positive feedback (abs: #{positive_feedback})" - puts "#{((with_proper_parse.to_f/count)*100).round 2}% with proper parse (abs: #{with_proper_parse})" - puts "#{((with_output.to_f/count)*100).round 2}% with output (abs: #{with_output})" + puts " type1 updates: #{type1_updates}" + puts " type2 updates: #{type2_updates}" + puts " top1 hits: #{top1_hit}" + puts " top1 variant: #{top1_variant}" + puts "top1 real variant: #{top1_real_variant}" + puts " hope hits: #{hope_hit}" + puts " hope variant: #{hope_variant}" + puts "hope real variant: #{hope_real_variant}" + puts " kbest size: #{(kbest_sz/count).round 2}" puts "#{((without_translations.to_f/count)*100).round 2}% without translations (abs: #{without_translations})" + puts top1_stats.print count + puts hope_stats.print count + puts fear_stats.print count + puts refs_stats.print count end |