From 4e3260df76571ee3be531a6d7c0c1b5c93a056a4 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 25 Apr 2014 15:25:34 +0200 Subject: merge,cleanup,rename --- README.md | 8 +- example/example.sh | 4 +- hopefear.rb | 108 +++-------- lampion.rb | 320 --------------------------------- rebol.rb | 318 ++++++++++++++++++++++++++++++++ scripts/geoquery/test-nof-old-crawl.sh | 5 - scripts/geoquery/test-nof-old.sh | 5 - scripts/geoquery/test-nof.sh | 5 - scripts/geoquery/test.sh | 4 +- 9 files changed, 355 insertions(+), 422 deletions(-) delete mode 100755 lampion.rb create mode 100755 rebol.rb delete mode 100755 scripts/geoquery/test-nof-old-crawl.sh delete mode 100755 scripts/geoquery/test-nof-old.sh delete mode 100755 scripts/geoquery/test-nof.sh diff --git a/README.md b/README.md index 9293207..f43f61d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ -lampion -======= +rebol +===== code for grounded SMT + +This has nothing to do with the programming language REBOL +http://www.rebol.com/ + diff --git a/example/example.sh b/example/example.sh index 1c67ed0..b359dfd 100755 --- a/example/example.sh +++ b/example/example.sh @@ -2,8 +2,8 @@ # memcached has to be running! `memcached -p 31337` -# run lampion with rampion variant for 1 epoch over 10 examples (data.*) -../lampion.rb \ +# run rebol with rampion variant for 1 epoch over 10 examples (data.*) +../rebol.rb \ -k 100 \ -i $(pwd)/data.in \ -r $(pwd)/data.en \ diff --git a/hopefear.rb b/hopefear.rb index 37782a4..aed0c9c 100644 --- a/hopefear.rb +++ b/hopefear.rb @@ -9,114 +9,60 @@ def hope_and_fear kbest, action max_idx = i; max = k.scores[:decoder] - k.scores[:psb] end } - return kbest[max_idx] + return max_idx end -def gethopefear_standard kbest, feedback - hope = fear = nil - type1 = type2 = false - if feedback == true - hope = kbest[0] - type1 = true - else - hope = hope_and_fear kbest, 'hope' - type2 = true - end - fear = hope_and_fear kbest, 'fear' - return hope, fear, false, type1, type2 -end - -def gethopefear_fear_no_exec kbest, feedback, gold, max - hope = fear = nil - type1 = type2 = false - if feedback == true - hope = kbest[0] - type1 = true - else - hope = hope_and_fear kbest, 'hope' - type2 = true - end - # sorted in descending order by max(decoder, psb), best ('hope') first - # select the 'best' translation that does not deliver the correct answer - kbest.sort{ |x,y| (y.scores[:decoder]+y.scores[:psb])<=>(x.scores[:decoder]+x.scores[:psb]) }.each_with_index { |k,i| - break if i==max - if !exec(k.s, gold, true)[0] - fear = k - break - end - } - skip=true if !fear - return hope, fear, skip, type1, type2 -end - -def gethopefear_fear_no_exec_skip kbest, feedback, gold - hope = fear = nil - type1 = type2 = false - if feedback == true - hope = kbest[0] - type1 = true - else - hope = hope_and_fear kbest, 'hope' - type2 = true - end - fear = hope_and_fear(kbest, 'fear') - # skip example if fear gives the right answer - skip = exec(fear.s, gold, true)[0] - return hope, fear, skip, type1, type2 -end - -def gethopefear_fear_no_exec_hope_exec kbest, feedback, gold, max - hope = fear = nil; hope_idx = 0 +def gethopefear_rebol kbest, feedback, gold, max, own_reference=nil + hope = fear = nil; new_reference = nil type1 = type2 = false - # sorted in descending order by max(decoder, psb), best ('hope') first - sorted_kbest = kbest.sort{ |x,y| (y.scores[:decoder]+y.scores[:psb])<=>(x.scores[:decoder]+x.scores[:psb]) } if feedback == true + # hope hope = kbest[0] + new_reference = hope + kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, new_reference } + # fear + kbest.sort_by { |k| -(k.scores[:model]-k.score[:psb]) }.each_with_index { |k,i| + break if i==max + if !exec(k.s, gold, true)[0] + fear = k + break + end + } type1 = true else - # select 'best' translation that correctly executes - sorted_kbest.each_with_index { |k,i| - next if i==0 + # fear + fear = kbest[0] + # hope + kbest.sort_by { |k| -(k.scores[:model]+k.score[:psb]) }.each_with_index { |k,i| break if i==max if exec(k.s, gold, true)[0] - hope_idx = i hope = k break end } type2 = true end - # select 'best' translation that does not correctly execute - sorted_kbest.each_with_index { |k,i| - break if i>(kbest.size-(hope_idx+1))||i==max - if !exec(k.s, gold, true)[0] - fear = k - break - end - } - # skip if hope or fear could no be found skip = true if !hope||!fear - return hope, fear, skip, type1, type2 + return hope, fear, skip, type1, type2, new_reference end -def gethopefear_fear_no_exec_hope_exec_skip kbest, feedback, gold, max +def gethopefear_rebol_light kbest, feedback, gold hope = fear = nil type1 = type2 = false if feedback == true hope = kbest[0] type1 = true else - hope = hope_and_fear kbest, 'hope' + hope = kbest[hope_and_fear kbest, 'hope'] type2 = true end - fear = hope_and_fear kbest, 'fear' - # skip if fear executes correctly or hope doesn't - skip = exec(fear.s, gold, true)[0]||!exec(hope.s, gold, true)[0] + fear = kbest[hope_and_fear kbest, 'fear'] + # skip example if fear gives the right answer + skip = exec(fear.s, gold, true)[0] return hope, fear, skip, type1, type2 end -# new variant w/ "real" reference -def gethopefear_only_exec kbest, feedback, gold, max, own_reference=nil +def gethopefear_exec kbest, feedback, gold, max, own_reference=nil hope = fear = nil; hope_idx = 0; new_reference = nil type1 = type2 = false if feedback == true @@ -158,10 +104,10 @@ def gethopefear_rampion kbest, reference # 1best is automatically hope if it matches reference if kbest[0].s == reference hope = kbest[0] - fear = hope_and_fear kbest, 'fear' + fear = kbest[hope_and_fear kbest, 'fear'] type1 = true else - hope = hope_and_fear kbest, 'hope' + hope = kbest[hope_and_fear kbest, 'hope'] # 1best is automatically fear if it doesn't match reference fear = kbest[0] type2 = true diff --git a/lampion.rb b/lampion.rb deleted file mode 100755 index 90db3c9..0000000 --- a/lampion.rb +++ /dev/null @@ -1,320 +0,0 @@ -#!/usr/bin/env ruby - -require 'nlp_ruby' -require 'trollop' -require 'tempfile' -require 'memcached' -require 'digest' -require_relative './hopefear' - - -def exec natural_language_string, reference_output, no_output=false - mrl = output = feedback = nil - # this may cause collisions, but there are not so many German words that - # could have different Umlauts at the same position, e.g. Häuser => H?user - key_prefix = Digest::SHA1.hexdigest(natural_language_string.encode('ASCII', :invalid => :replace, :undef => :replace, :replace => '?').gsub(/ /,'_')) - begin - mrl = $cache.get key_prefix+'__MRL' - output = $cache.get key_prefix+'__OUTPUT' - feedback = $cache.get key_prefix+'__FEEDBACK' - rescue Memcached::NotFound - mrl_cmd = "#{SMT_SEMPARSE} \"#{natural_language_string.gsub('"', ' ')}\"" - # beware: EVAL_PL sometimes hangs and can't be killed! - mrl = spawn_with_timeout(mrl_cmd, TIMEOUT, ACCEPT_ZOMBIES).strip - output = spawn_with_timeout("echo \"execute_funql_query(#{mrl}, X).\" | swipl -s #{EVAL_PL} 2>&1 | grep \"X =\"", TIMEOUT).strip.split('X = ')[1] - feedback = output==reference_output - begin - $cache.set key_prefix+'__MRL', mrl - $cache.set key_prefix+'__OUTPUT', output - $cache.set key_prefix+'__FEEDBACK', feedback - rescue SystemExit, Interrupt - $cache.delete key_prefix+'__MRL' - $cache.delete key_prefix+'__OUTPUT' - $cache.delete key_prefix+'__FEEDBACK"' - end - end - STDERR.write " nrl: #{natural_language_string}\n" if !no_output - STDERR.write " mrl: #{mrl}\n" if !no_output - STDERR.write " output: #{output}\n" if !no_output - STDERR.write " correct?: #{feedback}\n" if !no_output - return feedback, mrl, output -end - -class Stats - - def initialize name - @name = name - @with_parse = 0.0 - @with_output = 0.0 - @with_correct_output = 0.0 - end - - def update feedback, mrl, output - @with_parse += 1 if mrl!='' - @with_output += 1 if output!='' - @with_correct_output += 1 if feedback==true - end - - def to_s total - without_parse = total-@with_parse -<<-eos - #{@name} with parse #{((@with_parse/total)*100).round 2}% abs=#{@with_parse} - #{@name} with output #{((@with_output/total)*100).round 2}% abs=#{@with_output} -#{@name} with correct output #{((@with_correct_output/total)*100).round 2}% adj=#{((@with_correct_output/(total-without_parse))*100).round 2} abs=#{@with_correct_output} -eos - end -end - -def adjust_model_scores kbest, factor - min = kbest.map{ |k| k.scores[:decoder] }.min - max = kbest.map{ |k| k.scores[:decoder] }.max - return if min==0&&max==0 - kbest.each { |k| k.scores[:decoder_orig] = k.scores[:decoder]; k.scores[:decoder] = factor*((k.scores[:decoder]-min)/(max-min)) } -end - -def main - cfg = Trollop::options do - # [data] - opt :k, "k", :type => :int, :default => 100, :short => '-k' - opt :input, "'foreign' input", :type => :string, :required => true, :short => '-i' - opt :references, "(parseable) references", :type => :string, :required => true, :short => '-r' - opt :gold, "gold output", :type => :string, :required => true, :short => '-g' - # just for debugging: - opt :gold_mrl, "gold parse", :type => :string, :required => true, :short => '-h' - opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w' - opt :global_vars, "semantic parser, cdec bin, eval.pl", :type => :string, :required => true, :short => '-b' - opt :cdec_ini, "cdec config file", :type => :string, :required => true, :short => '-c' - # just used for 1best/hope variant detection - opt :stopwords_file, "stopwords file", :type => :string, :default => 'd/stopwords.en', :short => '-t' - # [output] - opt :output_weights, "output file for final weights", :type => :string, :required => true, :short => '-o' - opt :debug, "debug output", :type => :bool, :default => false, :short => '-d' - opt :print_kbest, "print full kbest lists", :type => :bool, :default => false, :short => '-l' - # [learning parameters] - opt :eta, "learning rate", :type => :float, :default => 0.01, :short => '-e' - opt :iterate, "iteration X epochs", :type => :int, :default => 1, :short => '-j' - opt :stop_after, "stop after x examples", :type => :int, :default => -1, :short => '-s' - opt :scale_model, "scale model scores by this factor", :type => :float, :default => 1.0, :short => '-m' - opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-n' - # don't use when 'bad' examples are filtered: - opt :skip_on_no_proper_gold, "skip, if the reference didn't produce a proper gold output", :type => :bool, :default => false, :short => '-x' - opt :no_update, "don't update weights", :type => :bool, :default => false, :short => '-y' - # don't use: - opt :hope_fear_max, "# entries to consider when searching good hope/fear", :type => :int, :default => 10**10, :short => '-q' - # see hopefear.rb: - opt :variant, "standard, rampion, fear_no_exec, fear_no_exec_skip, fear_no_exec_hope_exec, fear_no_exec_hope_exec_skip, only_exec", :default => 'standard', :short => '-v' - end - - require_relative cfg[:global_vars] - STDERR.write "CONFIGURATION\n" - cfg.each_pair { |k,v| STDERR.write " #{k}=#{v}\n" } - STDERR.write "SMT_SEMPARSE=#{SMT_SEMPARSE}\n" - STDERR.write "EVAL_PL=#{EVAL_PL}\n" - STDERR.write "CDEC_BIN=#{CDEC_BIN}\n\n" - - # read data - input = ReadFile.readlines_strip cfg[:input] - references = ReadFile.readlines_strip cfg[:references] - gold = ReadFile.readlines_strip cfg[:gold] - gold_mrl = ReadFile.readlines_strip cfg[:gold_mrl] - stopwords = ReadFile.readlines_strip cfg[:stopwords_file] - - # only for 'only_exec' variant - own_references = nil - own_references = references.map{ |i| nil } if cfg[:variant]=='only_exec' - - # initialize model - w = SparseVector.from_file cfg[:init_weights], ' ' - last_weights_fn = '' - - # iterations loop - cfg[:iterate].times { |iter| - - # (reset) numerous counters - count = 0 - without_translation = 0 - no_proper_gold_output = 0 - top1_stats = Stats.new 'top1' - hope_stats = Stats.new 'hope' - fear_stats = Stats.new 'fear' - type1_updates = 0 - type2_updates = 0 - top1_hit = 0 - top1_variant = 0 - top1_true_variant = 0 - hope_hit = 0 - hope_variant = 0 - hope_true_variant = 0 - kbest_sz = 0 - - # input loop - input.each_with_index { |i,j| - break if cfg[:stop_after]>0&&count==cfg[:stop_after] - count += 1 - - # write weights to file for cdec - tmp_file = Tempfile.new('rampion') - tmp_file_path = tmp_file.path - last_weights_fn = tmp_file.path - tmp_file.write w.to_kv ' ', "\n" - tmp_file.close - - # get kbest list - kbest = cdec_kbest CDEC_BIN, i, cfg[:cdec_ini], tmp_file_path, cfg[:k] - kbest_sz += kbest.size - - STDERR.write "\n=================\n" - STDERR.write " EXAMPLE: #{j}\n" - STDERR.write " REFERENCE: #{references[j]}\n" - STDERR.write " GOLD MRL: #{gold_mrl[j]}\n" - STDERR.write "GOLD OUTPUT: #{gold[j]}\n" - - # translation failed - if kbest.size == 0 - without_translation += 1 - STDERR.write "NO MT OUTPUT, skipping example\n" - next - end - - # don't use when data is filtered - if gold[j] == '[]' || gold[j] == '[...]' || gold[j] == '[].' || gold[j] == '[...].' - no_proper_gold_output += 1 - if cfg[:skip_on_no_proper_gold] - STDERR.write "NO PROPER GOLD OUTPUT, skipping example\n" - next - end - end - - # get per-sentence BLEU scores - kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, references[j] } - - # map decoder scores to [0,1] - adjust_model_scores kbest, cfg[:scale_model] - - if cfg[:print_kbest] - STDERR.write "\n<<< KBEST\n" - kbest.each_with_index { |k,l| STDERR.write k.to_s2+"\n" } - STDERR.write ">>>\n" - end - - # informative output - STDERR.write "\n [TOP1]\n" - # print 1best on last iteration - puts "#{kbest[0].s}" if iter+1==cfg[:iterate] - - # execute 1best - feedback, mrl, output = exec kbest[0].s, gold[j] - STDERR.write " SCORES: #{kbest[0].scores.to_s}\n" - top1_stats.update feedback, mrl, output - - # hope/fear variants - hope = fear = new_reference = nil - type1 = type2 = skip = false - case cfg[:variant] - when 'standard' - hope, fear, skip, type1, type2 = gethopefear_standard kbest, feedback - when 'rampion' - hope, fear, skip, type1, type2 = gethopefear_rampion kbest, references[j] - when 'fear_no_exec_skip' - hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_skip kbest, feedback, gold[j] - when 'fear_no_exec' - hope, fear, skip, type1, type2 = gethopefear_fear_no_exec kbest, feedback, gold[j], cfg[:hope_fear_max] - when 'fear_no_exec_hope_exec' - hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_hope_exec kbest, feedback, gold[j], cfg[:hope_fear_max] - when 'fear_no_exec_hope_exec_skip' - hope, fear, skip, type1, type2 = gethopefear_fear_no_exec_hope_exec_skip kbest, feedback, gold[j], cfg[:hope_fear_max] - when 'only_exec' - hope, fear, skip, type1, type2, new_reference = gethopefear_only_exec kbest, feedback, gold[j], cfg[:hope_fear_max], own_references[j] - else - STDERR.write "NO SUCH VARIANT, exiting.\n" - exit 1 - end - - # for 'only_exec' variant - if new_reference - own_references[j] = new_reference - end - - type1_updates+=1 if type1 - type2_updates+=1 if type2 - - # for string variant detection - ref_words = bag_of_words references[j], stopwords - - if kbest[0].s == references[j] - top1_hit += 1 - elsif feedback - top1_variant += 1 - top1_true_variant += 1 if !bag_of_words(kbest[0].s, stopwords).is_subset_of?(ref_words) - end - - # hope output & statistics - STDERR.write "\n [HOPE]\n" - if hope - feedback, mrl, output = exec hope.s, gold[j] - STDERR.write " SCORES: #{hope.scores.to_s}, ##{hope.rank}\n" - hope_stats.update feedback, mrl, output - if hope.s==references[j] - hope_hit += 1 - elsif feedback - hope_variant += 1 - hope_true_variant += 1 if !bag_of_words(hope.s, stopwords).is_subset_of?(ref_words) - end - end - - # fear output & statistics - STDERR.write "\n [FEAR]\n" - if fear - feedback, mrl, output = exec fear.s, gold[j] - STDERR.write " SCORES: #{fear.scores.to_s}, ##{fear.rank}\n" - fear_stats.update feedback, mrl, output - end - - # skip if needed - if skip || !hope || !fear - STDERR.write "NO GOOD HOPE/FEAR, skipping example\n\n" - next - end - - # update - w += (hope.f - fear.f) * cfg[:eta] if !cfg[:no_update] - - # normalize model - w.normalize! if cfg[:normalize] - } - - # save all weights - if cfg[:iterate] > 1 - WriteFile.write ReadFile.read(last_weights_fn), "#{cfg[:output_weights]}.#{iter}.gz" - else - FileUtils::cp(last_weights_fn, cfg[:output_weights]) - end - - STDERR.write <<-eos - ---- - iteration ##{iter+1}/#{cfg[:iterate]}: #{count} examples - type1 updates: #{type1_updates} - type2 updates: #{type2_updates} - top1 hits: #{top1_hit} - top1 variant: #{top1_variant} - top1 true variant: #{top1_true_variant} - hope hits: #{hope_hit} - hope variant: #{hope_variant} - hope true variant: #{hope_true_variant} - kbest size: #{(kbest_sz/count).round 2} - #{((without_translation.to_f/count)*100).round 2}% without translations (abs: #{without_translation}) - #{((no_proper_gold_output.to_f/count)*100).round 2}% no good gold output (abs: #{no_proper_gold_output}) - -#{top1_stats.to_s count} -#{hope_stats.to_s count} -#{fear_stats.to_s count} - -eos - - } -end - - -main - diff --git a/rebol.rb b/rebol.rb new file mode 100755 index 0000000..3c54a3c --- /dev/null +++ b/rebol.rb @@ -0,0 +1,318 @@ +#!/usr/bin/env ruby + +require 'nlp_ruby' +require 'trollop' +require 'tempfile' +require 'memcached' +require 'digest' +require_relative './hopefear' + + +def exec natural_language_string, reference_output, no_output=false + mrl = output = feedback = nil + # this may cause collisions, but there are not so many German words that + # could have different Umlauts at the same position, e.g. Häuser => H?user + key_prefix = Digest::SHA1.hexdigest(natural_language_string.encode('ASCII', :invalid => :replace, :undef => :replace, :replace => '?').gsub(/ /,'_')) + begin + mrl = $cache.get key_prefix+'__MRL' + output = $cache.get key_prefix+'__OUTPUT' + feedback = $cache.get key_prefix+'__FEEDBACK' + rescue Memcached::NotFound + mrl_cmd = "#{SMT_SEMPARSE} \"#{natural_language_string.gsub('"', ' ')}\"" + # beware: EVAL_PL sometimes hangs and can't be killed! + mrl = spawn_with_timeout(mrl_cmd, TIMEOUT, ACCEPT_ZOMBIES).strip + output = spawn_with_timeout("echo \"execute_funql_query(#{mrl}, X).\" | swipl -s #{EVAL_PL} 2>&1 | grep \"X =\"", TIMEOUT).strip.split('X = ')[1] + feedback = output==reference_output + begin + $cache.set key_prefix+'__MRL', mrl + $cache.set key_prefix+'__OUTPUT', output + $cache.set key_prefix+'__FEEDBACK', feedback + rescue SystemExit, Interrupt + $cache.delete key_prefix+'__MRL' + $cache.delete key_prefix+'__OUTPUT' + $cache.delete key_prefix+'__FEEDBACK"' + end + end + STDERR.write " nrl: #{natural_language_string}\n" if !no_output + STDERR.write " mrl: #{mrl}\n" if !no_output + STDERR.write " output: #{output}\n" if !no_output + STDERR.write " correct?: #{feedback}\n" if !no_output + return feedback, mrl, output +end + +class Stats + + def initialize name + @name = name + @with_parse = 0.0 + @with_output = 0.0 + @with_correct_output = 0.0 + end + + def update feedback, mrl, output + @with_parse += 1 if mrl!='' + @with_output += 1 if output!='' + @with_correct_output += 1 if feedback==true + end + + def to_s total + without_parse = total-@with_parse +<<-eos + #{@name} with parse #{((@with_parse/total)*100).round 2}% abs=#{@with_parse} + #{@name} with output #{((@with_output/total)*100).round 2}% abs=#{@with_output} +#{@name} with correct output #{((@with_correct_output/total)*100).round 2}% adj=#{((@with_correct_output/(total-without_parse))*100).round 2} abs=#{@with_correct_output} +eos + end +end + +def adjust_model_scores kbest, factor + min = kbest.map{ |k| k.scores[:decoder] }.min + max = kbest.map{ |k| k.scores[:decoder] }.max + return if min==0&&max==0 + kbest.each { |k| k.scores[:decoder_orig] = k.scores[:decoder]; k.scores[:decoder] = factor*((k.scores[:decoder]-min)/(max-min)) } +end + +def main + cfg = Trollop::options do + # [data] + opt :k, "k", :type => :int, :default => 100, :short => '-k' + opt :input, "'foreign' input", :type => :string, :required => true, :short => '-i' + opt :references, "(parseable) references", :type => :string, :required => true, :short => '-r' + opt :gold, "gold output", :type => :string, :required => true, :short => '-g' + # just for debugging: + opt :gold_mrl, "gold parse", :type => :string, :required => true, :short => '-h' + opt :init_weights, "initial weights", :type => :string, :required => true, :short => '-w' + opt :global_vars, "semantic parser, cdec bin, eval.pl", :type => :string, :required => true, :short => '-b' + opt :cdec_ini, "cdec config file", :type => :string, :required => true, :short => '-c' + # just used for 1best/hope variant detection + opt :stopwords_file, "stopwords file", :type => :string, :default => 'd/stopwords.en', :short => '-t' + # [output] + opt :output_weights, "output file for final weights", :type => :string, :required => true, :short => '-o' + opt :debug, "debug output", :type => :bool, :default => false, :short => '-d' + opt :print_kbest, "print full kbest lists", :type => :bool, :default => false, :short => '-l' + # [learning parameters] + opt :eta, "learning rate", :type => :float, :default => 0.01, :short => '-e' + opt :iterate, "iteration X epochs", :type => :int, :default => 1, :short => '-j' + opt :stop_after, "stop after x examples", :type => :int, :default => -1, :short => '-s' + opt :scale_model, "scale model scores by this factor", :type => :float, :default => 1.0, :short => '-m' + opt :normalize, "normalize weights after each update", :type => :bool, :default => false, :short => '-n' + # don't use when 'bad' examples are filtered: + opt :skip_on_no_proper_gold, "skip, if the reference didn't produce a proper gold output", :type => :bool, :default => false, :short => '-x' + opt :no_update, "don't update weights", :type => :bool, :default => false, :short => '-y' + # don't use: + opt :hope_fear_max, "# entries to consider when searching good hope/fear", :type => :int, :default => 10**10, :short => '-q' + # see hopefear.rb: + opt :variant, "rampion, rebol, rebol_light, exec", :type => :string, :default => 'rampion', :short => '-v' + end + + require_relative cfg[:global_vars] + STDERR.write "CONFIGURATION\n" + cfg.each_pair { |k,v| STDERR.write " #{k}=#{v}\n" } + STDERR.write "SMT_SEMPARSE=#{SMT_SEMPARSE}\n" + STDERR.write "EVAL_PL=#{EVAL_PL}\n" + STDERR.write "CDEC_BIN=#{CDEC_BIN}\n\n" + + # read data + input = ReadFile.readlines_strip cfg[:input] + references = ReadFile.readlines_strip cfg[:references] + gold = ReadFile.readlines_strip cfg[:gold] + gold_mrl = ReadFile.readlines_strip cfg[:gold_mrl] + stopwords = ReadFile.readlines_strip cfg[:stopwords_file] + + own_references = nil + own_references = references.map{ |i| nil } + + # initialize model + w = SparseVector.from_file cfg[:init_weights], ' ' + last_weights_fn = '' + + # iterations loop + cfg[:iterate].times { |iter| + + # (reset) numerous counters + count = 0 + without_translation = 0 + no_proper_gold_output = 0 + top1_stats = Stats.new 'top1' + hope_stats = Stats.new 'hope' + fear_stats = Stats.new 'fear' + type1_updates = 0 + type2_updates = 0 + top1_hit = 0 + top1_variant = 0 + top1_true_variant = 0 + hope_hit = 0 + hope_variant = 0 + hope_true_variant = 0 + kbest_sz = 0 + + # input loop + input.each_with_index { |i,j| + break if cfg[:stop_after]>0&&count==cfg[:stop_after] + count += 1 + + # write weights to file for cdec + tmp_file = Tempfile.new('rampion') + tmp_file_path = tmp_file.path + last_weights_fn = tmp_file.path + tmp_file.write w.to_kv ' ', "\n" + tmp_file.close + + # get kbest list + kbest = cdec_kbest CDEC_BIN, i, cfg[:cdec_ini], tmp_file_path, cfg[:k] + kbest_sz += kbest.size + + STDERR.write "\n=================\n" + STDERR.write " EXAMPLE: #{j}\n" + STDERR.write " REFERENCE: #{references[j]}\n" + STDERR.write " GOLD MRL: #{gold_mrl[j]}\n" + STDERR.write "GOLD OUTPUT: #{gold[j]}\n" + + # translation failed + if kbest.size == 0 + without_translation += 1 + STDERR.write "NO MT OUTPUT, skipping example\n" + next + end + + # don't use when data is filtered + if gold[j] == '[]' || gold[j] == '[...]' || gold[j] == '[].' || gold[j] == '[...].' + no_proper_gold_output += 1 + if cfg[:skip_on_no_proper_gold] + STDERR.write "NO PROPER GOLD OUTPUT, skipping example\n" + next + end + end + + # get per-sentence BLEU scores + kbest.each { |k| k.scores[:psb] = BLEU::per_sentence_bleu k.s, references[j] } + + # map decoder scores to [0,1] + adjust_model_scores kbest, cfg[:scale_model] + + if cfg[:print_kbest] + STDERR.write "\n<<< KBEST\n" + kbest.each_with_index { |k,l| STDERR.write k.to_s2+"\n" } + STDERR.write ">>>\n" + end + + # informative output + STDERR.write "\n [TOP1]\n" + # print 1best on last iteration + puts "#{kbest[0].s}" if iter+1==cfg[:iterate] + + # execute 1best + feedback, mrl, output = exec kbest[0].s, gold[j] + STDERR.write " SCORES: #{kbest[0].scores.to_s}\n" + top1_stats.update feedback, mrl, output + + # hope/fear variants + hope = fear = new_reference = nil + type1 = type2 = skip = false + case cfg[:variant] + when 'rampion' + hope, fear, skip, type1, type2 = gethopefear_rampion kbest, references[j] + when 'rebol' + hope, fear, skip, type1, type2, new_reference = gethopefear_rebol kbest, feedback, gold[j], cfg[:hope_fear_max], own_references[j] + when 'rebol_light' + hope, fear, skip, type1, type2 = gethopefear_rebol_light kbest, feedback, gold[j] + when 'only_exec' + hope, fear, skip, type1, type2, new_reference = gethopefear_exec kbest, feedback, gold[j], cfg[:hope_fear_max], own_references[j] + else + STDERR.write "NO SUCH VARIANT, exiting.\n" + exit 1 + end + + if new_reference + own_references[j] = new_reference.s + end + + type1_updates+=1 if type1 + type2_updates+=1 if type2 + + # for string variant detection + ref_words = bag_of_words references[j], stopwords + + if kbest[0].s == references[j] + top1_hit += 1 + elsif feedback + top1_variant += 1 + top1_true_variant += 1 if !bag_of_words(kbest[0].s, stopwords).is_subset_of?(ref_words) + end + + # hope output & statistics + STDERR.write "\n [HOPE]\n" + if hope + feedback, mrl, output = exec hope.s, gold[j] + STDERR.write " SCORES: #{hope.scores.to_s}, ##{hope.rank}\n" + hope_stats.update feedback, mrl, output + if hope.s==references[j] + hope_hit += 1 + elsif feedback + hope_variant += 1 + hope_true_variant += 1 if !bag_of_words(hope.s, stopwords).is_subset_of?(ref_words) + end + end + + # fear output & statistics + STDERR.write "\n [FEAR]\n" + if fear + feedback, mrl, output = exec fear.s, gold[j] + STDERR.write " SCORES: #{fear.scores.to_s}, ##{fear.rank}\n" + fear_stats.update feedback, mrl, output + end + + # skip if needed + if skip || !hope || !fear + STDERR.write "NO GOOD HOPE/FEAR, skipping example\n\n" + next + end + + # update + w += (hope.f - fear.f) * cfg[:eta] if !cfg[:no_update] + + # normalize model + w.normalize! if cfg[:normalize] + } + + # save all weights + if cfg[:iterate] > 1 + WriteFile.write ReadFile.read(last_weights_fn), "#{cfg[:output_weights]}.#{iter}.gz" + else + FileUtils::cp(last_weights_fn, cfg[:output_weights]) + end + + STDERR.write <<-eos + +--- + iteration ##{iter+1}/#{cfg[:iterate]}: #{count} examples + type1 updates: #{type1_updates} + type2 updates: #{type2_updates} + top1 hits: #{top1_hit} + top1 variant: #{top1_variant} + top1 true variant: #{top1_true_variant} + hope hits: #{hope_hit} + hope variant: #{hope_variant} + hope true variant: #{hope_true_variant} + kbest size: #{(kbest_sz/count).round 2} + #{((without_translation.to_f/count)*100).round 2}% without translations (abs: #{without_translation}) + #{((no_proper_gold_output.to_f/count)*100).round 2}% no good gold output (abs: #{no_proper_gold_output}) + +#{top1_stats.to_s count} +#{hope_stats.to_s count} +#{fear_stats.to_s count} + +eos + + STDERR.write "<<< #{own_references.size} OWN REFERENCES" + own_references.each_with_index { |i,j| + STDERR.write "#{j} '#{i}'" if i + } + STDERR.write ">>>" + + } +end + + +main + diff --git a/scripts/geoquery/test-nof-old-crawl.sh b/scripts/geoquery/test-nof-old-crawl.sh deleted file mode 100755 index 79c35a8..0000000 --- a/scripts/geoquery/test-nof-old-crawl.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -/workspace/grounded/lampion/scripts/geoquery/translate.sh $1 $2 < /workspace/grounded/lampion/proper/d/split880.test-nof-old-crawl.in | tee $2.transl | /workspace/grounded/lampion/scripts/geoquery/semparse.rb $3 | tee $2.parsed | /workspace/grounded/lampion/scripts/geoquery/query.rb $3 > $2.output -/workspace/grounded/lampion/scripts/geoquery/eval.rb /workspace/grounded/lampion/proper/d/split880.test-nof.gold < $2.output > $2.result - diff --git a/scripts/geoquery/test-nof-old.sh b/scripts/geoquery/test-nof-old.sh deleted file mode 100755 index 99a8241..0000000 --- a/scripts/geoquery/test-nof-old.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -/workspace/grounded/lampion/scripts/geoquery/translate.sh $1 $2 < /workspace/grounded/lampion/proper/d/split880.test-nof-old.in | tee $2.transl | /workspace/grounded/lampion/scripts/geoquery/semparse.rb $3 | tee $2.parsed | /workspace/grounded/lampion/scripts/geoquery/query.rb $3 > $2.output -/workspace/grounded/lampion/scripts/geoquery/eval.rb /workspace/grounded/lampion/proper/d/split880.test-nof.gold < $2.output > $2.result - diff --git a/scripts/geoquery/test-nof.sh b/scripts/geoquery/test-nof.sh deleted file mode 100755 index 786afc2..0000000 --- a/scripts/geoquery/test-nof.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -/workspace/grounded/lampion/scripts/geoquery/translate.sh $1 $2 < /workspace/grounded/lampion/proper/d/split880.test-nof.in | tee $2.transl | /workspace/grounded/lampion/scripts/geoquery/semparse.rb $3 | tee $2.parsed | /workspace/grounded/lampion/scripts/geoquery/query.rb $3 > $2.output -/workspace/grounded/lampion/scripts/geoquery/eval.rb /workspace/grounded/lampion/proper/d/split880.test-nof.gold < $2.output > $2.result - diff --git a/scripts/geoquery/test.sh b/scripts/geoquery/test.sh index 3ac8b2d..3dea047 100755 --- a/scripts/geoquery/test.sh +++ b/scripts/geoquery/test.sh @@ -1,5 +1,5 @@ #!/bin/sh -/workspace/grounded/lampion/scripts/geoquery/translate.sh $1 $2 < /workspace/grounded/lampion/proper/d/split880.test.in | tee $2.transl | /workspace/grounded/lampion/scripts/geoquery/semparse.rb $3 | tee $2.parsed | /workspace/grounded/lampion/scripts/geoquery/query.rb $3 > $2.output -/workspace/grounded/lampion/scripts/geoquery/eval.rb /workspace/grounded/lampion/proper/d/split880.test.gold < $2.output > $2.result +/workspace/grounded/rebol/scripts/geoquery/translate.sh $1 $2 < /workspace/grounded/rebol/proper/d/split880.test.in | tee $2.transl | /workspace/grounded/rebol/scripts/geoquery/semparse.rb $3 | tee $2.parsed | /workspace/grounded/rebol/scripts/geoquery/query.rb $3 > $2.output +/workspace/grounded/rebol/scripts/geoquery/eval.rb /workspace/grounded/rebol/proper/d/split880.test.gold < $2.output > $2.result -- cgit v1.2.3