From 3db876b9fbd93670e421f0ddb627ca7463330533 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 5 Feb 2014 22:39:35 +0100 Subject: bleu, more methods for SparseVector, misc => bump to 0.2 --- lib/nlp_ruby.rb | 2 + lib/nlp_ruby/SparseVector.rb | 43 ++++++++++++++++- lib/nlp_ruby/bleu.rb | 110 +++++++++++++++++++++++++++++++++++++++++++ lib/nlp_ruby/fileutil.rb | 2 +- lib/nlp_ruby/misc.rb | 6 +++ lib/nlp_ruby/stringutil.rb | 20 +++++++- lib/nlp_ruby/ttable.rb | 59 +++++++++++++++++++++++ nlp_ruby.gemspec | 8 ++-- 8 files changed, 242 insertions(+), 8 deletions(-) create mode 100644 lib/nlp_ruby/bleu.rb create mode 100644 lib/nlp_ruby/misc.rb diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb index b80f893..212c367 100755 --- a/lib/nlp_ruby.rb +++ b/lib/nlp_ruby.rb @@ -9,6 +9,8 @@ require 'nlp_ruby/tfidf' require 'nlp_ruby/ttable' require 'nlp_ruby/dags' require 'nlp_ruby/semirings' +require 'nlp_ruby/bleu' +require 'nlp_ruby/misc' STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index 0033690..9919a65 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -5,10 +5,14 @@ class SparseVector < Hash self.default = 0 end - def from_hash h + def from_h h h.each_pair { |k,v| self[k] = v } end + def from_s s + from_h eval(s) + end + def sum self.values.inject(:+) end @@ -48,6 +52,43 @@ class SparseVector < Hash dims.each { |d| sum += (self[d] - other[d])**2 } return Math.sqrt(sum) end + + def to_kv + a = [] + self.each_pair { |k,v| + a << "#{k}=#{v}" + } + return a.join ' ' + end + + def join_keys other + self.keys + other.keys + end + + def + other + new = SparseVector.new + join_keys(other).each { |k| + new[k] = self[k]+other[k] + } + return new + end + + def - other + new = SparseVector.new + join_keys(other).each { |k| + new[k] = self[k]-other[k] + } + return new + end + + def * scalar + raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric + new = SparseVector.new + self.keys.each { |k| + new[k] = self[k] * scalar + } + return new + end end def mean_sparse_vector array_of_vectors diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb new file mode 100644 index 0000000..42be45e --- /dev/null +++ b/lib/nlp_ruby/bleu.rb @@ -0,0 +1,110 @@ +module BLEU + + +class BLEU::NgramCounts + attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n + + def initialize(n) + @n = 0 + @sum = [] + @clipped = [] + @ref_len = 0.0 + @hyp_len = 0.0 + grow(n) + end + + def grow(n) + (n-@n).times { + @sum << 0.0 + @clipped << 0.0 + } + @n = n + end + + def plus_eq(other) + if other.n > @n then grow(other.n) end + 0.upto(other.n-1) { |m| + @sum[m] += other.sum[m] + @clipped[m] += other.clipped[m] + } + @ref_len += other.ref_len + @hyp_len += other.hyp_len + end + + def to_s + return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}" + end +end + +class BLEU::Ngrams + def initialize + @h_ = {} + @h_.default = 0 + end + + def add(k) + if k.class == Array then k = k.join ' ' end + @h_[k] += 1 + end + + def get_count(k) + if k.class == Array then k = k.join ' ' end + return @h_[k] + end + + def each + @h_.each_pair { |k,v| + yield k.split, v + } + end + + def to_s + @h_.to_s + end +end + +def BLEU::get_counts hypothesis, reference, n, times=1 + p = NgramCounts.new n + r = Ngrams.new + ngrams(reference, n) { |ng| r.add ng } + h = Ngrams.new + ngrams(hypothesis, n) { |ng| h.add ng } + h.each { |ng,count| + sz = ng.size-1 + p.sum[sz] += count * times + p.clipped[sz] += [r.get_count(ng), count].min * times + } + p.ref_len = tokenize(reference.strip).size * times + p.hyp_len = tokenize(hypothesis.strip).size * times + return p +end + +def BLEU::brevity_penalty(c, r) + if c > r then return 1.0 end + return Math.exp(1-r/c) +end + +def BLEU::bleu(counts, n, debug=false) + corpus_stats = NgramCounts.new n + counts.each { |i| corpus_stats.plus_eq i } + sum = 0.0 + w = 1.0/n + 0.upto(n-1) { |m| + STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug + return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0 + sum += w * Math.log(corpus_stats.clipped[m] / corpus_stats.sum[m]) + } + if debug + STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n" + STDERR.write "sum #{Math.exp(sum)}\n" + end + return brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len) * Math.exp(sum) +end + +def BLEU::hbleu counts, n, debug=false + (100*bleu(counts, n, debug)).round(3) +end + + +end + diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb index 825ceb4..e560aae 100644 --- a/lib/nlp_ruby/fileutil.rb +++ b/lib/nlp_ruby/fileutil.rb @@ -40,7 +40,7 @@ class WriteFile def initialize fn, encoding='utf-8' if fn.split('.').last == 'gz' - @f = Zlib::GzipWrite.new(File.new(fn, 'wb+'), :external_encoding=>encoding) + @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding) elsif fn == '-' @f = STDOUT STDOUT.set_encoding encoding diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb new file mode 100644 index 0000000..9a4064f --- /dev/null +++ b/lib/nlp_ruby/misc.rb @@ -0,0 +1,6 @@ +class Array + def max_index + self.index(self.max) + end +end + diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb index e9a3bc9..4091994 100644 --- a/lib/nlp_ruby/stringutil.rb +++ b/lib/nlp_ruby/stringutil.rb @@ -3,8 +3,8 @@ def tokenize s s.strip.split end -def splitpipe s - s.strip.split(/\s*\|\|\|\s*/) +def splitpipe s, n=3 + s.strip.split("|"*n) end def downcase? s @@ -32,3 +32,19 @@ def read_feature_string s return map end + +def read_cfg fn + f = ReadFile.new fn + cfg = {} + while line = f.gets + line.strip! + next if /^\s*$/.match line + next if line[0]=='#' + content = line.split('#', 2).first + k, v = content.split(/\s*=\s*/, 2) + k.strip!; v.strip! + cfg[k] = v + end + return cfg +end + diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb index 20b1412..598e318 100644 --- a/lib/nlp_ruby/ttable.rb +++ b/lib/nlp_ruby/ttable.rb @@ -15,3 +15,62 @@ def read_phrase_table fn return table end +class Translation + attr_accessor :id, :s, :raw, :f, :score + + def initialize id=nil, raw=nil, s=nil, f=nil, score=nil + @id = id + @raw = raw + @s = s + @f = f + @score = score + end + + def from_s t, strip_alignment=true + id, raw, features, score = splitpipe(t, 3) + raw.strip! + @raw = raw + if strip_alignment # the way moses does it + @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ') + @s.strip! + else + @s = raw + end + @id = id.to_i + @f = read_feature_string features + @score = score.to_f + end + + def to_s + [id, s, f.to_kv, score].join ' ||| ' + end +end + +def read_kbest_lists fn, translation_type=Translation + kbest_lists = [] + cur = [] + f = ReadFile.new fn + prev = -1 + c = 0 + id = 0 + while line = f.gets + t = translation_type.new + t.from_s line + c = splitpipe(line)[0].to_i + if c != prev + if cur.size > 0 + kbest_lists << cur + cur = [] + end + prev = c + id = 0 + end + t.id = id + cur << t + id += 1 + end + kbest_lists << cur # last one + f.close + return kbest_lists +end + diff --git a/nlp_ruby.gemspec b/nlp_ruby.gemspec index 0737994..66716ad 100644 --- a/nlp_ruby.gemspec +++ b/nlp_ruby.gemspec @@ -1,9 +1,9 @@ Gem::Specification.new do |s| s.name = 'nlp_ruby' - s.version = '0.1' - s.date = '2014-01-29' - s.summary = "nlp_ruby" - s.description = "NLP related tools and classes" + s.version = '0.2' + s.date = '2014-02-05' + s.summary = 'nlp_ruby' + s.description = 'NLP related tools and classes' s.authors = ["Patrick Simianer"] s.email = 'p@simianer.de' s.files = Dir['lib/*.rb', 'lib/nlp_ruby/*.rb'] -- cgit v1.2.3