From 6fc9c7ce2171687ac3319973d1af02904b06b790 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Wed, 26 Feb 2014 18:17:11 +0100 Subject: smarter BLEU --- lib/nlp_ruby/bleu.rb | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb index d7a6b2b..56f341b 100644 --- a/lib/nlp_ruby/bleu.rb +++ b/lib/nlp_ruby/bleu.rb @@ -79,49 +79,50 @@ def BLEU::get_counts hypothesis, reference, n, times=1 return p end -def BLEU::brevity_penalty c, r, hack=0.0 - return 1.0 if c>r - return Math.exp 1.0-((r+hack)/c) +def BLEU::brevity_penalty c, r, smooth=0.0 + return [0.0, 1.0-((r+smooth)/c)].min end def BLEU::bleu counts, n, debug=false corpus_stats = NgramCounts.new n counts.each { |i| corpus_stats.plus_eq i } - sum = 0.0 - w = 1.0/n + logbleu = 0.0 0.upto(n-1) { |m| STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0 - sum += w * Math.log(corpus_stats.clipped[m] / corpus_stats.sum[m]) + logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m]) } + logbleu /= n if debug STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n" STDERR.write "sum #{Math.exp(sum)}\n" end - return brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len) * Math.exp(sum) + logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len + return Math.exp logbleu end def BLEU::hbleu counts, n, debug=false (100*bleu(counts, n, debug)).round(3) end -def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0 +def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0 h_ng = {}; r_ng = {} - (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} - ngrams(hypothesis, n) {|i| h_ng[i.size] << i} - ngrams(reference, n) {|i| r_ng[i.size] << i} + (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] } + ngrams(hypothesis, n) { |i| h_ng[i.size] << i } + ngrams(reference, n) { |i| r_ng[i.size] << i } m = [n, reference.split.size].min - weight = 1.0/m add = 0.0 - sum = 0 + logbleu = 0.0 (1).upto(m) { |i| counts_clipped = 0 counts_sum = h_ng[i].size h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } add = 1.0 if i >= 2 - sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); + logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add); } - return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum) + logbleu /= m + logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth + return Math.exp logbleu end -- cgit v1.2.3