diff options
| author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2014-01-08 18:11:48 +0100 | 
|---|---|---|
| committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2014-01-08 18:11:48 +0100 | 
| commit | 8f3b6a8889bc1b8a18f14e947360e0a8bee808b7 (patch) | |
| tree | 11038cd228b65f0384969aa3d70c425217409c13 /rampfion.rb | |
| parent | cb1042dd0d7d292d343b2c89f02a174f013de9c5 (diff) | |
too much to say
Diffstat (limited to 'rampfion.rb')
| -rwxr-xr-x | rampfion.rb | 657 | 
1 files changed, 657 insertions, 0 deletions
# Provenance (diff header of the commit this script was extracted from):
#   diff --git a/rampfion.rb b/rampfion.rb
#   new file mode 100755
#   index 0000000..ce40917
#   --- /dev/null
#   +++ b/rampfion.rb
#   @@ -0,0 +1,657 @@
#!/usr/bin/env ruby

require 'trollop'
require 'tempfile'
require 'open3'
require 'memcached'
require 'timeout'
require 'shellwords'


SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset 2>/dev/null'
EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl'
CDEC = "/toolbox/cdec-dtrain/bin/cdec"

# names accepted by --variant (see the dispatch in main)
VARIANTS = ['standard', 'rampion', 'fear_no_exec', 'fear_no_exec_skip',
            'fear_no_exec_hope_exec', 'only_exec']

$cache = Memcached.new("localhost:11211")

# Runs +cmd+ through the shell and returns what it wrote to stdout.
# The semantic parser hangs sometimes, so the call is bounded by +t+ seconds;
# on timeout the empty string is returned.
#
# cmd   - shell command line
# t     - timeout in seconds
# debug - echo the command before running it
def spawn_with_timeout(cmd, t=4, debug=false)
  puts cmd if debug
  pipe_in, pipe_out = IO.pipe
  begin
    pid = Process.spawn(cmd, :out => pipe_out)
    # the parent must drop its write end, otherwise read never sees EOF
    pipe_out.close
    begin
      return Timeout.timeout(t) {
        # read before waiting: a child that fills the pipe buffer would
        # otherwise block until the timeout fires
        data = pipe_in.read
        Process.wait pid
        data
      }
    rescue Timeout::Error
      # the child is not killed (as before), but it is detached so that it
      # gets reaped once it finally exits
      Process.detach pid
      return ""
    end
  ensure
    pipe_out.close unless pipe_out.closed?
    pipe_in.close unless pipe_in.closed?
  end
end

# Parses +natural_language_string+ with the semantic parser, executes the
# resulting query and compares its output to +reference_output+.
# Results are memoized in memcached under a key derived from the input string.
#
# Returns [feedback, func, output]: feedback is true iff output equals
# reference_output, func is the parser output, output is the text after
# 'X = ' in the evaluator's output (nil if there is none).
def exec(natural_language_string, reference_output, no_output=false)
  func = nil
  output = nil
  feedback = nil
  key_prefix = natural_language_string.encode("ASCII", :invalid => :replace, :undef => :replace, :replace => "?").gsub(/ /,'_')
  begin
    func = $cache.get key_prefix+"__FUNC"
    output = $cache.get key_prefix+"__OUTPUT"
    feedback = $cache.get key_prefix+"__FEEDBACK"
  rescue Memcached::NotFound
    # arguments are shell-escaped: the strings are decoder/parser output and
    # may contain quotes, '$' or backticks
    func   = spawn_with_timeout("#{SMT_SEMPARSE} #{Shellwords.escape(natural_language_string)}").strip
    query  = "execute_funql_query(#{func}, X)."
    output = spawn_with_timeout("echo #{Shellwords.escape(query)} | swipl -s #{EVAL_PL} 2>&1  | grep \"X =\"").strip.split('X = ')[1]
    feedback = output==reference_output
    begin
      $cache.set key_prefix+"__FUNC", func
      $cache.set key_prefix+"__OUTPUT", output
      $cache.set key_prefix+"__FEEDBACK", feedback
    rescue SystemExit, Interrupt
      # do not leave a partially written entry behind, then let the
      # exit/interrupt take effect instead of swallowing it
      ["__FUNC", "__OUTPUT", "__FEEDBACK"].each { |suffix|
        begin
          $cache.delete key_prefix+suffix
        rescue Memcached::NotFound
          # this key had not been written yet
        end
      }
      raise
    end
  end
  if !no_output
    puts "        nrl: #{natural_language_string}"
    puts "        mrl: #{func}"
    puts "     output: #{output}"
    puts "   correct?: #{feedback}"
  end
  return feedback, func, output
end

# decoder interaction/translations
#
# One entry of a k-best list. A line has four ' ||| '-separated fields; the
# second is the translation string, the third whitespace-separated name=value
# features and the fourth the model score.
class Translation
  attr_accessor :s, :f, :rank, :model, :score

  def initialize(kbest_line, rank=-1)
    a = kbest_line.split ' ||| '
    raise ArgumentError, "malformed k-best line: '#{kbest_line}'" if a.size < 4
    @s = a[1].strip
    h = {}
    a[2].split.each { |i|
      name, value = i.split '='
      h[name] = value.to_f
    }
    @f = NamedSparseVector.new h
    @rank = rank
    @model = a[3].to_f
    @score = -1.0 # placeholder until score_translations is run
  end

  def to_s
    "#{@rank} ||| #{@s} ||| #{@model} ||| #{@score} ||| #{@f.to_s}"
  end
end

# Decodes sentence +s+ with cdec and returns the k-best list as an array of
# Translation objects ranked in output order.
#
# s   - source sentence
# k   - size of the k-best list
# ini - cdec configuration file
# w   - weights file
def predict_translation(s, k, ini, w)
  cmd = "#{CDEC} -c #{Shellwords.escape(ini.to_s)} -r -k #{Integer(k)} -w #{Shellwords.escape(w.to_s)} 2>/dev/null"
  # the sentence is fed on stdin directly, it never passes through the shell
  out, _status = Open3.capture2(cmd, :stdin_data => "#{s}\n")
  out.split("\n").each_with_index.map { |line, rank| Translation.new(line, rank) }
end

# scoring (per-sentence BLEU)

# Yields every n-gram of +s+ (as an array of tokens) of length 1..n, or only
# those of length exactly n if +fix+ is set.
def ngrams_it(s, n, fix=false)
  a = s.strip.split
  a.each_index { |i|
    0.upto([n-1, a.size-i-1].min) { |m|
      gram = a[i..i+m]
      yield gram if !fix || gram.size == n
    }
  }
end

# 1.0 if the hypothesis is longer than the reference, exp(1 - |ref|/|hyp|)
# otherwise; 0.0 for an empty hypothesis.
def brevity_penalty(hypothesis, reference)
  a = hypothesis.split; b = reference.split
  return 0.0 if a.empty?
  return 1.0 if a.size>b.size
  Math.exp(1.0 - b.size.to_f/a.size)
end

# Per-sentence BLEU of +hypothesis+ against a single +reference+, using
# n-grams up to min(n, reference length); precisions for orders >= 2 are
# add-one smoothed. Returns 0.0 if either string has no tokens.
def per_sentence_bleu(hypothesis, reference, n=4)
  return 0.0 if hypothesis.split.empty? || reference.split.empty?
  h_ng = {}; r_ng = {}
  1.upto(n) { |i| h_ng[i] = Hash.new(0); r_ng[i] = Hash.new(0) }
  ngrams_it(hypothesis, n) { |g| h_ng[g.size][g] += 1 }
  ngrams_it(reference, n) { |g| r_ng[g.size][g] += 1 }
  m = [n, reference.split.size].min
  weight = 1.0/m
  add = 0.0
  sum = 0.0
  1.upto(m) { |i|
    counts_sum = h_ng[i].values.inject(0, :+)
    counts_clipped = 0
    # clip each hypothesis n-gram count at its count in the reference
    h_ng[i].each_pair { |g, c| counts_clipped += [c, r_ng[i][g]].min }
    add = 1.0 if i >= 2
    sum += weight * Math.log((counts_clipped + add)/(counts_sum + add))
  }
  brevity_penalty(hypothesis, reference) * Math.exp(sum)
end

# Sets the score of every translation to its per-sentence BLEU.
def score_translations(list_of_translations, reference)
  list_of_translations.each { |i| i.score = per_sentence_bleu i.s, reference }
end

# hope and fear
#
# Returns the entry of +kbest+ maximizing model+score ('hope') or model-score
# ('fear'); the first such entry wins on ties.
def hope_and_fear(kbest, action)
  sign = case action
         when 'hope' then 1.0
         when 'fear' then -1.0
         else raise ArgumentError, "unknown action '#{action}', expected 'hope' or 'fear'"
         end
  max = -1.0/0
  max_idx = -1 # if no entry compares greater than -Infinity the last one is returned
  kbest.each_with_index { |i,j|
    val = i.model + sign*i.score
    if val > max
      max_idx = j; max = val
    end
  }
  kbest[max_idx]
end

# update: returns w + eta*(hope.f - fear.f)
def update(w, hope, fear, eta)
  w + (hope.f - fear.f)*eta
end

# weights
#
# Sparse vector keyed by feature name; absent features read as 0.0.
class NamedSparseVector
  attr_accessor :h

  # init - optional Hash used as storage (not copied; its default is set to 0.0)
  def initialize(init=nil)
    @h = init || {}
    @h.default = 0.0
  end

  def +(other)
    ret = NamedSparseVector.new({}.update(@h))
    other.each_pair { |k,v| ret[k] += v }
    ret
  end

  def -(other)
    ret = NamedSparseVector.new({}.update(@h))
    other.each_pair { |k,v| ret[k] -= v }
    ret
  end

  def *(scalar)
    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
    ret = NamedSparseVector.new
    @h.each_pair { |k,v| ret[k] = v*scalar }
    ret
  end

  def dot(other)
    sum = 0.0
    @h.each_pair { |k,v| sum += v * other[k] }
    sum
  end

  # Reads 'name value' lines from file +fn+ into this vector.
  def from_file(fn)
    File.open(fn, 'r') { |f|
      f.each_line { |line|
        name, value = line.strip.split
        next if !name # blank line
        @h[name] = value.to_f
      }
    }
  end

  # Returns the vector serialized as 'name value' lines (nothing is written).
  def to_file
    @h.map { |k,v| "#{k} #{v}" }.join("\n")+"\n"
  end

  def [](k)
    @h[k]
  end

  def []=(k, v)
    @h[k] = v
  end

  def each_pair
    @h.each_pair { |k,v| yield k,v }
  end

  def to_s
    @h.to_s
  end

  # Euclidean norm
  def length
    Math.sqrt(@h.values.map { |i| i*i }.inject(0.0, :+))
  end

  # Scales the vector to length 1; a zero vector is left untouched.
  def normalize!
    l = length
    return if l == 0.0
    @h.each_pair { |k,v| @h[k] = v/l }
  end

  def size
    @h.keys.size
  end
end

# map model scores to [0,factor] by min-max scaling; if all scores are equal
# they are all set to 0.0
def adj_model(kbest, factor)
  scores = kbest.map { |i| i.model }
  min = scores.min
  range = scores.max.to_f - min.to_f
  kbest.each { |i|
    i.model = range == 0.0 ? 0.0 : factor*((i.model-min)/range)
  }
end

# Counters for one group of translations (top1, hope, fear, refs).
class Stats
  def initialize(name)
    @name = name
    @with_parse = 0.0
    @with_output       = 0.0
    @correct_output    = 0.0
  end

  def update(feedback, func, output)
    @with_parse += 1 if func!="None"&&func!=''
    # a failed execution yields nil, which must not count as output
    @with_output += 1 if output&&output!="null"&&output!=''
    @correct_output += 1 if feedback==true
  end

  # Returns the report as a string; percentages are relative to +total+,
  # 'adj' is relative to the number of translations with a parse.
  def print(total)
<<-eos
  [#{@name}]
         #{@name} with parse #{percent(@with_parse, total)}  abs:#{@with_parse}
        #{@name} with output #{percent(@with_output, total)} abs:#{@with_output}
#{@name} with correct output #{percent(@correct_output, total)} adj:#{percent(@correct_output, @with_parse)} abs:#{@correct_output}
eos
  end

  private

  # num/denom as a percentage with two decimals, 0.0 if denom is zero
  def percent(num, denom)
    return 0.0 if denom.zero?
    ((num/denom)*100).round 2
  end
end

def _print(rank, string, model, score)
  puts "rank=#{rank} string='#{string}' model=#{model} score=#{score}"
end

# sorted unique tokens of +s+ without stopwords
def bag_of_words(s, stopwords=[])
  s.split.uniq.sort.reject { |v| stopwords.include? v }
end

# The gethopefear_* functions select a hope and a fear translation from
# +kbest+. They return [hope, fear, skip, type1, type2(, new_reference)]:
# skip is truthy if no update should be made, type1 is set if the hope was
# picked because of positive feedback (or a stored reference), type2 if it
# had to be searched for.

# Top-1 as hope on positive feedback, model+score maximizer otherwise.
# Returns [hope, type1, type2].
def hope_by_feedback(kbest, feedback)
  return kbest[0], true, false if feedback == true
  return hope_and_fear(kbest, 'hope'), false, true
end

# k-best list ordered by descending model+score
def sort_by_hope(kbest)
  kbest.sort { |x,y| (y.model+y.score)<=>(x.model+x.score) }
end

def gethopefear_standard(kbest, feedback)
  hope, type1, type2 = hope_by_feedback kbest, feedback
  fear = hope_and_fear(kbest, 'fear')
  return hope, fear, false, type1, type2
end

# fear: first of the top +max+ entries (by model+score) that does not execute
# to +gold+
def gethopefear_fear_no_exec(kbest, feedback, gold, max)
  hope, type1, type2 = hope_by_feedback kbest, feedback
  fear = nil
  sort_by_hope(kbest).each_with_index { |k,i|
    break if i==max
    if !exec(k.s, gold, true)[0]
      fear = k
      break
    end
  }
  return hope, fear, fear.nil?, type1, type2
end

# standard fear, but the example is skipped if the fear executes to +gold+
def gethopefear_fear_no_exec_skip(kbest, feedback, gold)
  hope, type1, type2 = hope_by_feedback kbest, feedback
  fear = hope_and_fear(kbest, 'fear')
  skip = exec(fear.s, gold, true)[0]
  return hope, fear, skip, type1, type2
end

# hope: top-1 on positive feedback, otherwise the first entry after position 0
# (by model+score, below +max+) that executes to +gold+; fear as in
# gethopefear_fear_no_exec, limited by the position of the hope
def gethopefear_fear_no_exec_hope_exec(kbest, feedback, gold, max)
  hope = fear = nil; hope_idx = 0
  type1 = type2 = false
  sorted_kbest = sort_by_hope kbest
  if feedback == true
    hope = kbest[0]
    type1 = true
  else
    sorted_kbest.each_with_index { |k,i|
      next if i==0
      break if i==max
      if exec(k.s, gold, true)[0]
        hope_idx = i
        hope = k
        break
      end
    }
    type2 = true
  end
  sorted_kbest.each_with_index { |k,i|
    break if i>(kbest.size-(hope_idx+1))||i==max
    if !exec(k.s, gold, true)[0]
      fear = k
      break
    end
  }
  skip = !hope||!fear
  return hope, fear, skip, type1, type2
end

# hope: top-1 on positive feedback (which also becomes the new reference),
# else +own_reference+ if given, else the first entry after position 0 (below
# +max+) that executes to +gold+; fear: first entry other than position 0 and
# the hope (below +max+) that does not execute to +gold+
def gethopefear_only_exec(kbest, feedback, gold, max, own_reference=nil)
  hope = fear = nil; hope_idx = 0; new_reference = nil
  type1 = type2 = false
  if feedback == true
    hope = kbest[0]
    new_reference = hope
    type1 = true
  elsif own_reference
    hope = own_reference
    type1 = true
  else
    kbest.each_with_index { |k,i|
      next if i==0
      break if i==max
      if exec(k.s, gold, true)[0]
        hope_idx = i
        hope = k
        break
      end
    }
    type2 = true
  end
  kbest.each_with_index { |k,i|
    next if i==0||i==hope_idx
    break if i==max
    if !exec(k.s, gold, true)[0]
      fear = k
      break
    end
  }
  skip = !hope||!fear
  return hope, fear, skip, type1, type2, new_reference
end

# same selection as gethopefear_only_exec
def gethopefear_only_exec_simple(kbest, feedback, gold, max, own_reference=nil)
  gethopefear_only_exec kbest, feedback, gold, max, own_reference
end

# hope/fear relative to the reference string instead of execution feedback
def gethopefear_rampion(kbest, reference)
  if kbest[0].s == reference
    return kbest[0], hope_and_fear(kbest, 'fear'), false, true, false
  end
  return hope_and_fear(kbest, 'hope'), kbest[0], false, false, true
end

def main
  opts = Trollop::options do
    # data
    opt :k, "k", :type => :int, :default => 10000
    opt :hope_fear_max, "asdf",  :type => :int, :default => 32, :short => '-q'
    opt :input, "'foreign' input", :type => :string, :required => true
    opt :references, "(parseable) references", :type => :string, :required => true
    opt :gold, "gold output", :type => :string, :required => true
    opt :gold_mrl, "gold parse", :type => :string, :short => '-h', :required => true
    opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w'
    opt :cdec_ini, "cdec config file", :type => :string, :default => './cdec.ini'
    # output
    opt :debug, "debug output", :type => :bool, :default => false
    opt :output_weights, "output file for final weights", :type => :string, :required => true
    opt :stop_after, "stop after x examples", :type => :int, :default => -1
    opt :print_kbests, "print full kbest lists", :type => :bool, :default => false, :short => '-l'
    # important parameters
    opt :eta, "learning rate", :type => :float, :default => 0.01
    opt :iterate, "iteration X epochs", :type => :int, :default => 1, :short => '-j'
    opt :variant, "standard, rampion, fear_no_exec, fear_no_exec_skip, fear_no_exec_hope_exec, only_exec", :default => 'standard'
    # misc parameters
    opt :scale_model, "scale model score by this factor", :type => :float, :default => 1.0, :short => '-m'
    opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-n'
    opt :skip_on_no_proper_gold, "skip if the reference didn't produce a proper gold output", :default => false, :short => '-x'
    opt :no_update, "don't update weights", :type => :bool, :default => false, :short => '-y'
  end
  # fail before any decoding is done
  if !VARIANTS.include? opts[:variant]
    puts "no such hope/fear variant"
    exit 1
  end
  # output configuration
  puts "cfg"
  opts.each_pair {|k,v| puts "#{k}=#{v}"}
  puts
  # read files
  read_lines = lambda { |fn| File.readlines(fn, :encoding=>'utf-8').map { |l| l.strip } }
  input      = read_lines.call opts[:input]
  references = read_lines.call opts[:references]
  gold       = read_lines.call opts[:gold]
  gold_mrl   = read_lines.call opts[:gold_mrl]
  stopwords  = read_lines.call 'd/stopwords.en'
  # only_exec: new refs
  own_references = nil
  own_references = references.map { |i| nil } if opts[:variant]== 'only_exec'
  # init weights
  w = NamedSparseVector.new
  w.from_file opts[:init_weights]
  # iterate
  opts[:iterate].times { |iter|
    # numerous counters
    without_translations  = 0
    no_proper_gold_output = 0
    count                 = 0
    top1_stats = Stats.new 'top1'
    hope_stats = Stats.new 'hope'
    fear_stats = Stats.new 'fear'
    refs_stats = Stats.new 'refs'
    type1_updates     = 0
    type2_updates     = 0
    top1_hit          = 0
    top1_variant      = 0
    top1_real_variant = 0
    hope_hit          = 0
    hope_variant      = 0
    hope_real_variant = 0
    kbest_sz          = 0
    # for each example
    input.each_with_index { |i,j|
      # stop after x examples; checked up front so that skipped examples
      # cannot bypass the limit
      break if opts[:stop_after]>0 && j>=opts[:stop_after]
      count += 1
      # write current weights to a temporary file for the decoder and get
      # the kbest list for the current input
      kbest = nil
      tmp_file = Tempfile.new('rampion')
      begin
        tmp_file.write w.to_file
        tmp_file.close
        kbest = predict_translation i, opts[:k], opts[:cdec_ini], tmp_file.path
      ensure
        tmp_file.close! # also removes the file
      end
      kbest_sz += kbest.size
      # output
      puts "EXAMPLE #{j}"
      puts "GOLD MRL: #{gold_mrl[j]}"
      puts "GOLD OUTPUT #{gold[j]}"
      # skip if no translation could be produced
      if kbest.size == 0
        without_translations += 1
        puts "NO MT OUTPUT, skipping example\n\n"
        next
      end
      # no proper gold
      if gold[j] == '[]' || gold[j] == '[...]' || gold[j] == '[].'
        no_proper_gold_output += 1
        if opts[:skip_on_no_proper_gold]
          puts "NO PROPER GOLD OUTPUT, skipping example\n\n"
          next
        end
      end
      # score kbest list
      score_translations kbest, references[j]
      # print kbest list
      if opts[:print_kbests]
        puts "<<<KBEST"
        kbest.each_with_index { |k,l|
          _print l, k.s, k.model, k.score
        }
        puts ">>>"
      end
      # adjust model scores to fit in [0,1]
      adj_model kbest, opts[:scale_model]
      # top1
      puts "---top1"
      puts "TOP1 TRANSLATION: #{kbest[0].s}" if iter+1==opts[:iterate]
      _print 0, kbest[0].s, kbest[0].model, kbest[0].score
      feedback, func, output = exec kbest[0].s, gold[j]
      top1_stats.update feedback, func, output
      # reference as bag of words
      ref_words = bag_of_words references[j], stopwords
      # hope and fear
      hope = fear = new_reference = nil
      type1 = type2 = skip = false
      case opts[:variant]
      when 'standard'
        hope, fear, skip, type1, type2 = gethopefear_standard kbest, feedback
      when 'rampion'
        hope, fear, skip, type1, type2 = gethopefear_rampion kbest, references[j]
      when 'fear_no_exec_skip'
        hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_skip kbest, feedback, gold[j]
      when 'fear_no_exec'
        hope, fear, skip, type1, type2 = gethopefear_fear_no_exec kbest, feedback, gold[j], opts[:hope_fear_max]
      when 'fear_no_exec_hope_exec'
        hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_hope_exec kbest, feedback, gold[j], opts[:hope_fear_max]
      when 'only_exec'
        hope, fear, skip, type1, type2, new_reference = gethopefear_only_exec kbest, feedback, gold[j], opts[:hope_fear_max], own_references[j]
      else
        puts "no such hope/fear variant"
        exit 1
      end
      # new reference (only_exec)
      own_references[j] = new_reference if new_reference
      # type1/type2
      type1_updates+=1 if type1
      type2_updates+=1 if type2
      # top1/hope hit
      if kbest[0].s == references[j]
        top1_hit += 1
      else
        top1_variant += 1
        top1_real_variant += 1 if bag_of_words(kbest[0].s,stopwords)!=ref_words
      end
      if hope&&hope.s == references[j]
        hope_hit += 1
      elsif hope
        hope_variant += 1
        hope_real_variant += 1 if bag_of_words(hope.s,stopwords)!=ref_words
      end
      # output info for current example
      puts "---hope"
      if hope
        _print hope.rank, hope.s, hope.model, hope.score
        feedback, func, output = exec hope.s, gold[j]
        hope_stats.update feedback, func, output
      end
      puts "---fear"
      if fear
        _print fear.rank, fear.s, fear.model, fear.score
        feedback, func, output = exec fear.s, gold[j]
        fear_stats.update feedback, func, output
      end
      puts "---reference"
      _print 'x', references[j], 'x', 1.0
      feedback, func, output = exec references[j], gold[j]
      refs_stats.update feedback, func, output
      # skip example?
      if skip||!hope||!fear
        puts "NO GOOD FEAR/HOPE, skipping example\n\n"
        next
      end
      puts
      # update
      w = update w, hope, fear, opts[:eta] if !opts[:no_update]
      # normalize weight vector to length 1
      w.normalize! if opts[:normalize]
    }
    # keep weight files for each iteration; the current vector is written
    # directly so that the last update of the epoch is included
    weights_fn = opts[:iterate] > 1 ? "#{opts[:output_weights]}.#{iter}" : opts[:output_weights]
    File.open(weights_fn, 'w') { |f| f.write w.to_file }
    # output stats
    pct = lambda { |x| count > 0 ? ((x.to_f/count)*100).round(2) : 0.0 }
    avg_kbest_sz = count > 0 ? (kbest_sz.to_f/count).round(2) : 0.0
    puts "iteration ##{iter+1}/#{opts[:iterate]}"
    puts "#{count} examples"
    puts "    type1 updates: #{type1_updates}"
    puts "    type2 updates: #{type2_updates}"
    puts "        top1 hits: #{top1_hit}"
    puts "     top1 variant: #{top1_variant}"
    puts "top1 real variant: #{top1_real_variant}"
    puts "        hope hits: #{hope_hit}"
    puts "     hope variant: #{hope_variant}"
    puts "hope real variant: #{hope_real_variant}"
    puts "       kbest size: #{avg_kbest_sz}"
    puts "#{pct.call without_translations}% without translations (abs: #{without_translations})"
    puts "#{pct.call no_proper_gold_output}% no good gold output (abs: #{no_proper_gold_output})"
    puts top1_stats.print count
    puts hope_stats.print count
    puts fear_stats.print count
    puts refs_stats.print count
  }
end


main
