summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-02-26 18:17:11 +0100
committer Patrick Simianer <p@simianer.de> 2014-02-26 18:17:11 +0100
commit 6fc9c7ce2171687ac3319973d1af02904b06b790 (patch)
tree 8d54b7dc68fb548a4569831c16175ea9d757c41b
parent 47fe94c4addff8a3719998a32a0423662e9eccd2 (diff)
smarter BLEU
-rw-r--r-- lib/nlp_ruby/bleu.rb 31
1 files changed, 16 insertions, 15 deletions
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index d7a6b2b..56f341b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,49 +79,50 @@ def BLEU::get_counts hypothesis, reference, n, times=1
return p
end
-def BLEU::brevity_penalty c, r, hack=0.0
- return 1.0 if c>r
- return Math.exp 1.0-((r+hack)/c)
+def BLEU::brevity_penalty c, r, smooth=0.0
+ return [0.0, 1.0-((r+smooth)/c)].min
end
def BLEU::bleu counts, n, debug=false
corpus_stats = NgramCounts.new n
counts.each { |i| corpus_stats.plus_eq i }
- sum = 0.0
- w = 1.0/n
+ logbleu = 0.0
0.upto(n-1) { |m|
STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
- sum += w * Math.log(corpus_stats.clipped[m] / corpus_stats.sum[m])
+ logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
}
+ logbleu /= n
if debug
STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
STDERR.write "sum #{Math.exp(sum)}\n"
end
- return brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len) * Math.exp(sum)
+ logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
+ return Math.exp logbleu
end
def BLEU::hbleu counts, n, debug=false
(100*bleu(counts, n, debug)).round(3)
end
-def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
h_ng = {}; r_ng = {}
- (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
- ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
- ngrams(reference, n) {|i| r_ng[i.size] << i}
+ (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
+ ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
+ ngrams(reference, n) { |i| r_ng[i.size] << i }
m = [n, reference.split.size].min
- weight = 1.0/m
add = 0.0
- sum = 0
+ logbleu = 0.0
(1).upto(m) { |i|
counts_clipped = 0
counts_sum = h_ng[i].size
h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
add = 1.0 if i >= 2
- sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
+ logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
}
- return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
+ logbleu /= m
+ logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
+ return Math.exp logbleu
end