diff options
Diffstat (limited to 'lib/nlp_ruby/bleu.rb')
-rw-r--r-- | lib/nlp_ruby/bleu.rb | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb new file mode 100644 index 0000000..42be45e --- /dev/null +++ b/lib/nlp_ruby/bleu.rb @@ -0,0 +1,110 @@ +module BLEU + + +class BLEU::NgramCounts + attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n + + def initialize(n) + @n = 0 + @sum = [] + @clipped = [] + @ref_len = 0.0 + @hyp_len = 0.0 + grow(n) + end + + def grow(n) + (n-@n).times { + @sum << 0.0 + @clipped << 0.0 + } + @n = n + end + + def plus_eq(other) + if other.n > @n then grow(other.n) end + 0.upto(other.n-1) { |m| + @sum[m] += other.sum[m] + @clipped[m] += other.clipped[m] + } + @ref_len += other.ref_len + @hyp_len += other.hyp_len + end + + def to_s + return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}" + end +end + +class BLEU::Ngrams + def initialize + @h_ = {} + @h_.default = 0 + end + + def add(k) + if k.class == Array then k = k.join ' ' end + @h_[k] += 1 + end + + def get_count(k) + if k.class == Array then k = k.join ' ' end + return @h_[k] + end + + def each + @h_.each_pair { |k,v| + yield k.split, v + } + end + + def to_s + @h_.to_s + end +end + +def BLEU::get_counts hypothesis, reference, n, times=1 + p = NgramCounts.new n + r = Ngrams.new + ngrams(reference, n) { |ng| r.add ng } + h = Ngrams.new + ngrams(hypothesis, n) { |ng| h.add ng } + h.each { |ng,count| + sz = ng.size-1 + p.sum[sz] += count * times + p.clipped[sz] += [r.get_count(ng), count].min * times + } + p.ref_len = tokenize(reference.strip).size * times + p.hyp_len = tokenize(hypothesis.strip).size * times + return p +end + +def BLEU::brevity_penalty(c, r) + if c > r then return 1.0 end + return Math.exp(1-r/c) +end + +def BLEU::bleu(counts, n, debug=false) + corpus_stats = NgramCounts.new n + counts.each { |i| corpus_stats.plus_eq i } + sum = 0.0 + w = 1.0/n + 0.upto(n-1) { |m| + STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug + return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0 + sum += w * Math.log(corpus_stats.clipped[m] / corpus_stats.sum[m]) + } + if debug + STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n" + STDERR.write "sum #{Math.exp(sum)}\n" + end + return brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len) * Math.exp(sum) +end + +def BLEU::hbleu counts, n, debug=false + (100*bleu(counts, n, debug)).round(3) +end + + +end + |