From bb023a7d1ff78009d0fb00817310c0dea96277c4 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <simianer@cl.uni-heidelberg.de>
Date: Wed, 18 Dec 2013 14:50:39 +0100
Subject: good version

---
 rampion_with_feedback.rb | 334 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 215 insertions(+), 119 deletions(-)

diff --git a/rampion_with_feedback.rb b/rampion_with_feedback.rb
index 70bf463..a69a922 100755
--- a/rampion_with_feedback.rb
+++ b/rampion_with_feedback.rb
@@ -5,20 +5,20 @@ require 'tempfile'
 require 'open3'
 
 
-# execute
-SMT_SEMPARSE = '/workspace/grounded/mosesdecoder/moses-chart-cmd/bin/gcc-4.7/release/debug-symbols-on/link-static/threading-multi/moses_chart -f /workspace/grounded/smt-semparse/latest/model/moses.ini 2>/dev/null'
+SMT_SEMPARSE = 'python /workspace/grounded/smt-semparse-cp/decode_sentence.py /workspace/grounded/smt-semparse-cp/working/full_dataset'
 EVAL_PL = '/workspace/grounded/wasp-1.0/data/geo-funql/eval/eval.pl'
-def exec natural_language_string, reference_output
-  flat_mrl = `echo "#{natural_language_string}" | ./stem.py | #{SMT_SEMPARSE}`.strip
-  func = `echo "#{flat_mrl}" | ./functionalize.py 2>/dev/null`.strip
-  res = `echo "execute_funql_query(#{func}, X)." | swipl -s #{EVAL_PL} 2>&1  | grep "X ="`.strip.split('X = ')[1]
-  puts "     nrl: #{natural_language_string}"
-  puts "flat mrl: #{flat_mrl}"
-  puts "    func: #{func}"
-  puts "  output: #{res}"
-  return res==reference_output, func, res
-end
+CDEC = "/toolbox/cdec-dtrain/bin/cdec"
 
+# execute
+def exec natural_language_string, reference_output, no_output=false
+  func   = `#{SMT_SEMPARSE} "#{natural_language_string}"`.strip
+  output = `echo "execute_funql_query(#{func}, X)." | swipl -s #{EVAL_PL} 2>&1  | grep "X ="`.strip.split('X = ')[1]
+  puts "        nrl: #{natural_language_string}" if !no_output
+  puts "        mrl: #{func}" if !no_output
+  puts "     output: #{output}" if !no_output
+  puts "   correct?: #{output==reference_output}" if !no_output
+  return output==reference_output, func, output
+end
 
 # decoder interaction/translations
 class Translation
@@ -44,15 +44,12 @@ class Translation
   end
 end
 
-CDEC = "/toolbox/cdec-dtrain/bin/cdec -r"
 def predict_translation s, k, ini, w
-  cmd = " echo \"#{s}\" | #{CDEC} -c #{ini} -k #{k} -w #{w} 2>/dev/null"
-  o, s = Open3.capture2(cmd)
+  o, s = Open3.capture2 "echo \"#{s}\" | #{CDEC} -c #{ini} -r -k #{k} -w #{w} 2>/dev/null"
   j = -1
   return o.split("\n").map{|i| j+=1; Translation.new(i, j)}
 end
 
-
 # scoring (per-sentence BLEU)
 def ngrams_it(s, n, fix=false)
   a = s.strip.split
@@ -64,70 +61,59 @@ def ngrams_it(s, n, fix=false)
   }
 end
 
-def brevity_penalty h, r
-  a = h.split
-  b = r.split
+def brevity_penalty hypothesis, reference
+  a = hypothesis.split; b = reference.split
   return 1.0 if a.size>b.size
   return Math.exp(1.0 - b.size.to_f/a.size);
 end
 
-def per_sentence_bleu h, r, n=4
-  h_ng = {}
-  r_ng = {}
-  (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
-  ngrams_it(h, n) { |i| h_ng[i.size] << i }
-  ngrams_it(r, n) { |i| r_ng[i.size] << i }
-  m = [n,r.split.size].min
+def per_sentence_bleu hypothesis, reference, n=4
+  h_ng = {}; r_ng = {}
+  (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
+  ngrams_it(hypothesis, n) {|i| h_ng[i.size] << i}
+  ngrams_it(reference, n) {|i| r_ng[i.size] << i}
+  m = [n, reference.split.size].min
   weight = 1.0/m
   add = 0.0
   sum = 0
   (1).upto(m) { |i|
     counts_clipped = 0
     counts_sum = h_ng[i].size
-    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
+    h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
     add = 1.0 if i >= 2
     sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
   }
-  return brevity_penalty(h,r) * Math.exp(sum)
+  return brevity_penalty(hypothesis, reference) * Math.exp(sum)
 end
 
-def score_translations a, reference
-  a.each_with_index { |i,j|
-    i.score = per_sentence_bleu i.s, reference
-  }
+def score_translations list_of_translations, reference
+  list_of_translations.each { |i| i.score = per_sentence_bleu i.s, reference}
 end
-### /scoring
 
-
-
-### hope and fear
-def hope_and_fear a, act='hope'
+# hope and fear
+def hope_and_fear kbest, action
   max = -1.0/0
   max_idx = -1
-  a.each_with_index { |i,j|
-  if act=='hope' && i.model + i.score > max
+  kbest.each_with_index { |i,j|
+  if action=='hope' && i.model + i.score > max
     max_idx = j; max = i.model + i.score
   end
-  if act=='fear' && i.model - i.score > max
+  if action=='fear' && i.model - i.score > max
     max_idx = j; max = i.model - i.score
   end
   }
-  return a[max_idx]
+  return kbest[max_idx]
 end
-### /hope and fear
 
-
-
-### update
-def update w, hope, fear
-  w = w + (hope.f - fear.f)
+# update
+def update w, hope, fear, eta
+  diff = hope.f - fear.f
+  diff *= eta
+  w += diff
   return w
 end
-### /update
-
 
-
-### weights
+# weights
 class NamedSparseVector
   attr_accessor :h
 
@@ -199,41 +185,59 @@ class NamedSparseVector
     @h.to_s
   end
 
+  def length
+    Math.sqrt(@h.values.map{|i|i*i}.inject(:+))
+  end
+
+  def normalize!
+    l = length
+    @h.each_pair { |k,v|
+      @h[k] = v/l
+    }
+  end
+
   def size
     @h.keys.size
   end
 end
-### /weights
 
+# map models score to [0,1]
+def adj_model kbest, factor
+  min = kbest.map{|i|i.model}.min
+  max = kbest.map{|i|i.model}.max
+  kbest.each {|i| i.model = factor*((i.model-min)/(max-min))}
+end
 
-def test opts
-  w = NamedSparseVector.new
-  w.from_file opts[:init_weights]
-  input = File.new(opts[:input], 'r').readlines.map{|i|i.strip}
-  references = File.new(opts[:references], 'r').readlines.map{|i|i.strip}
-  f = File.new('weights.tmp', 'w+')
-  f.write w.to_file
-  f.close
-  kbest = predict_translation input[0], opts[:k], 'weights.tmp'
-  score_translations kbest, references[0]
-  kbest.each_with_index { |i,j|
-    puts "#{i.rank} #{i.s} #{i.model} #{i.score}"
-  }
-  puts
-  puts "hope"
-  hope = hope_and_fear kbest, 'hope'
-  puts "#{hope.rank} #{hope.s} #{hope.model} #{hope.score}"
-  puts "fear"
-  fear = hope_and_fear kbest, 'fear'
-  puts "#{fear.rank} #{fear.s} #{fear.model} #{fear.score}"
+class Stats
+  def initialize name
+    @name = name
+    @with_parse = 0.0
+    @with_output       = 0.0
+    @correct_output    = 0.0
+  end
+
+  def update feedback, func, output
+    @with_parse +=1 if func!="None"
+    @with_output +=1 if output!="null"
+    @correct_output += 1 if feedback==true
+  end
+
+  def print total
+<<-eos
+  [#{@name}]
+         with parse #{((@with_parse/total)*100).round 2} abs:#{@with_parse}
+        with output #{((@with_output/total)*100).round 2} abs:#{@with_output}
+with correct output #{((@correct_output/total)*100).round 2} abs:#{@correct_output}
+eos
+  end
 end
 
-def adj_model a
-  min = a.map{|i|i.model}.min
-  max = a.map{|i|i.model}.max
-  a.each { |i|
-    i.model = (i.model-min)/(max-min)
-  }
+def _print rank, string, model, score
+    puts "rank=#{rank} string='#{string}' model=#{model} score=#{score}"
+end
+
+def bag_of_words s, stopwords=[]
+  s.split.uniq.sort.reject{|v| stopwords.include? v}
 end
 
 def main
@@ -241,83 +245,175 @@ def main
     opt :k, "k", :type => :int, :required => true
     opt :input, "'foreign' input", :type => :string, :required => true
     opt :references, "(parseable) references", :type => :string, :required => true
-    opt :gold, "gold standard parser output", :type => :string, :require => true
-    opt :gold_mrl, "gold standard mrl", :type => :string, :short => '-h', :require => true
+    opt :gold, "gold output", :type => :string, :require => true
+    opt :gold_mrl, "gold parse", :type => :string, :short => '-h', :require => true
     opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w'
     opt :cdec_ini, "cdec config file", :type => :string, :default => './cdec.ini'
+    opt :eta, "learning rate", :type => :float, :default => 0.01
+    opt :no_update, "don't update weights", :type => :bool, :default => false
+    opt :stop_after, "stop after x examples", :type => :int, :default => -1
+    opt :output_weights, "output file for final weights", :type => :string, :required => true
+    opt :scale_model, "scale model score by this factor", :type => :float, :default => 1.0, :short => '-m'
+    opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-l'
+    opt :print_kbests, "print full kbest lists", :type => :bool, :default => false, :short => '-j'
+    opt :hope2, "select hope from the first X items in kbest that executes", :type => :int, :default => 0, :short => '-x'
+    opt :fear2, "skip example if fear executes", :type => :bool, :default => false
   end
 
-  input = File.new(opts[:input], 'r').readlines.map{|i|i.strip}
+  puts "cfg"
+  opts.each_pair {|k,v| puts "#{k}\t#{v}"}
+  puts
+
+  input      = File.new(opts[:input], 'r').readlines.map{|i|i.strip}
   references = File.new(opts[:references], 'r').readlines.map{|i|i.strip}
-  gold = File.new(opts[:gold], 'r').readlines.map{|i|i.strip}
-  gold_mrl = File.new(opts[:gold_mrl], 'r').readlines.map{|i|i.strip}
+  gold       = File.new(opts[:gold], 'r').readlines.map{|i|i.strip}
+  gold_mrl   = File.new(opts[:gold_mrl], 'r').readlines.map{|i|i.strip}
+
+  stopwords = File.new('stopwords.en', 'r').readlines.map{|i|i.strip}
 
   # init weights
   w = NamedSparseVector.new
   w.from_file opts[:init_weights]
 
-
-  positive_feedback = 0
   without_translations = 0
-  with_proper_parse = 0
-  with_output = 0
-  count = 0
+  count                = 0
+  top1_stats = Stats.new 'top1'
+  hope_stats = Stats.new 'hope'
+  fear_stats = Stats.new 'fear'
+  refs_stats = Stats.new 'refs'
+  type1_updates = 0
+  type2_updates = 0
+  top1_hit = 0
+  top1_variant = 0
+  top1_real_variant = 0
+  hope_hit = 0
+  hope_variant = 0
+  hope_real_variant = 0
+  kbest_sz = 0
+  last_wf = ''
   input.each_with_index { |i,j|
     count += 1
     # write current weights to file
     tmp_file = Tempfile.new('rampion')
     tmp_file_path = tmp_file.path
+    last_wf = tmp_file.path
     tmp_file.write w.to_file
     tmp_file.close
     # get kbest list for current input
     kbest = predict_translation i, opts[:k], opts[:cdec_ini], tmp_file_path
-    if kbest.size==0 # FIXME: shouldnt happen
+    kbest_sz += kbest.size
+    if kbest.size==0
       without_translations += 1
       next
     end
     score_translations kbest, references[j]
-    adj_model kbest
+    if opts[:print_kbests]
+      puts "KBEST"
+      kbest.each_with_index { |k,l|
+        _print l, k.s, k.model, k.score
+      }
+    end
+    adj_model kbest, opts[:scale_model]
     # get feedback
- 
-    puts "----top1"
-    puts "0 #{kbest[0].s} #{kbest[0].model} #{kbest[0].score}"
+    puts "EXAMPLE #{j}"
+    puts "GOLD MRL: #{gold_mrl[j]}"
+    puts "GOLD OUTPUT #{gold[j]}"
+    # fear
+    fear = hope_and_fear kbest, 'fear'
+    if opts[:fear2]
+      f, g, o = exec fear.s, gold[j], true
+      if f
+        puts "FEAR EXECUTED, skipping example\n\n"
+        next
+      end
+    end
+    # top1
+    puts "---top1"
+    _print 0, kbest[0].s, kbest[0].model, kbest[0].score
     feedback, func, output = exec kbest[0].s, gold[j]
-    with_proper_parse +=1 if func!="None"
-    with_output +=1 if output!="null"
-    positive_feedback  += 1 if feedback==true
-    hope = ''; fear = ''
+    # hope2
+    parses = []
+    if opts[:hope2]>0
+      already_seen = {}
+      puts "<<KBEST EXEC"
+      (1).upto([opts[:hope2]-1, kbest.size-1].min) { |l|
+        f, g, o = exec kbest[l].s, gold[j], true
+        words = bag_of_words kbest[l].s, stopwords
+        parses << f
+        puts "#{f} | #{l} | #{kbest[l].s} #{words.to_s}" if !already_seen.has_key? words
+        already_seen[words] = true
+      }
+      puts ">>>"
+    end
+    top1_stats.update feedback, func, output
+    # hope & update
+    ref_words = bag_of_words references[j], stopwords
+    hope = nil
     if feedback==true
-      puts "'#{kbest[0].s}'"
-      references[j] = kbest[0].s
+      if kbest[0].s == references[j]
+        top1_hit +=1
+      else
+        top1_variant += 1
+        top1_real_variant += 1 if bag_of_words(kbest[0].s,stopwords)!=ref_words
+      end
+      #references[j] = kbest[0].s
       hope = kbest[0]
+      type1_updates += 1
     else
-      hope = hope_and_fear kbest, 'hope'
+      if opts[:hope2]>0
+        c=-1; found = parses.detect{|b| c+=1; b }
+        hope = kbest[c] if found
+        if !found
+          puts "NO GOOD HOPE, skipping example\n\n"
+          next
+        end
+      else
+        hope = hope_and_fear kbest, 'hope'
+      end
+      if hope.s == references[j]
+        hope_hit += 1
+      else
+        hope_variant += 1
+        hope_real_variant += 1 if bag_of_words(hope.s,stopwords)!=ref_words
+      end
+      type2_updates += 1
     end
-    fear = hope_and_fear kbest, 'fear'
-    
-    puts "----hope"
-    puts "#{hope.rank} #{hope.s} #{hope.model} #{hope.score}"
-    exec hope.s, gold[j]
-
-    puts "----fear"
-    puts "#{fear.rank} #{fear.s} #{fear.model} #{fear.score}"
-    exec fear.s, gold[j]
-
-    puts "----reference"
-    puts "// #{references[j]} // 1.0"
-    exec references[j], gold[j]
-    puts "GOLD MRL: #{gold_mrl[j]}"
-    puts "GOLD OUTPUT #{gold[j]}"
+
+    puts "---hope"
+    _print hope.rank, hope.s, hope.model, hope.score
+    feedback, func, output =  exec hope.s, gold[j]
+    hope_stats.update feedback, func, output
+    puts "---fear"
+    _print fear.rank, fear.s, fear.model, fear.score
+    feedback, func, output = exec fear.s, gold[j]
+    fear_stats.update  feedback, func, output
+    puts "---reference"
+    _print 'x', references[j], 'x', 1.0
+    feedback, func, output = exec references[j], gold[j]
+    refs_stats.update feedback, func, output
 
     puts
 
-    w = update w, hope, fear
+    w = update w, hope, fear, opts[:eta] if !opts[:no_update]
+    w.normalize! if opts[:normalize]
+    break if opts[:stop_after]>0 && (j+1)==opts[:stop_after]
   }
+  FileUtils::cp(last_wf, opts[:output_weights])
   puts "#{count} examples"
-  puts "#{((positive_feedback.to_f/count)*100).round 2}% with positive feedback (abs: #{positive_feedback})"
-  puts "#{((with_proper_parse.to_f/count)*100).round 2}% with proper parse (abs: #{with_proper_parse})"
-  puts "#{((with_output.to_f/count)*100).round 2}% with output (abs: #{with_output})"
+  puts "    type1 updates: #{type1_updates}"
+  puts "    type2 updates: #{type2_updates}"
+  puts "        top1 hits: #{top1_hit}"
+  puts "     top1 variant: #{top1_variant}"
+  puts "top1 real variant: #{top1_real_variant}"
+  puts "        hope hits: #{hope_hit}"
+  puts "     hope variant: #{hope_variant}"
+  puts "hope real variant: #{hope_real_variant}"
+  puts "       kbest size: #{(kbest_sz/count).round 2}"
   puts "#{((without_translations.to_f/count)*100).round 2}% without translations (abs: #{without_translations})"
+  puts top1_stats.print count
+  puts hope_stats.print count
+  puts fear_stats.print count
+  puts refs_stats.print count
 end
 
 
-- 
cgit v1.2.3