bleu, more methods for SparseVector, misc => bump to 0.2

author: Patrick Simianer <p@simianer.de> 2014-02-05 22:39:35 +0100
committer: Patrick Simianer <p@simianer.de> 2014-02-05 22:39:35 +0100
commit: 3db876b9fbd93670e421f0ddb627ca7463330533 (patch)
tree: 8e0b9b5abd09dc6d479fe76f21a97ab915e7ed8d
parent: 4228c0af3c550a85d37b5565a806b8864a774c83 (diff)
8 files changed, 242 insertions, 8 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
index b80f893..212c367 100755
--- a/lib/nlp_ruby.rb
+++ b/lib/nlp_ruby.rb
@@ -9,6 +9,8 @@ require 'nlp_ruby/tfidf'
 require 'nlp_ruby/ttable'
 require 'nlp_ruby/dags'
 require 'nlp_ruby/semirings'
+require 'nlp_ruby/bleu'
+require 'nlp_ruby/misc'
 
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 0033690..9919a65 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -5,10 +5,14 @@ class SparseVector < Hash
     self.default = 0
   end
 
-  def from_hash h
+  def from_h h # copies every key/value pair of h into this vector, overwriting keys that already exist
     h.each_pair { |k,v| self[k] = v }
   end
 
+  def from_s s # fills this vector from s; s must evaluate to an object responding to each_pair (e.g. a Hash literal)
+    from_h eval(s) # SECURITY(review): eval runs arbitrary Ruby code -- only pass trusted strings
+  end
+
   def sum
     self.values.inject(:+)
   end
@@ -48,6 +52,43 @@ class SparseVector < Hash
     dims.each { |d| sum += (self[d] - other[d])**2 }
     return Math.sqrt(sum)
   end
+
+  # Renders the vector as a String of "key=value" pairs.
+  # Pairs follow the hash's iteration order and are separated by one space;
+  # an empty vector gives "".
+  def to_kv
+    pairs = map { |key, value| "#{key}=#{value}" }
+    pairs.join ' '
+  end
+
+  def join_keys other # keys of self and other, each listed once (set union, first-seen order)
+    keys | other.keys # a plain `+` repeated shared keys, so callers processed them twice
+  end
+
+  # Element-wise sum: returns a new SparseVector holding self[k] + other[k]
+  # for every key k returned by join_keys(other).
+  def + other
+    join_keys(other).each_with_object(SparseVector.new) { |key, total|
+      total[key] = self[key] + other[key]
+    }
+  end
+
+  # Element-wise difference: returns a new SparseVector holding
+  # self[k] - other[k] for every key k returned by join_keys(other).
+  def - other
+    join_keys(other).each_with_object(SparseVector.new) { |key, diff|
+      diff[key] = self[key] - other[key]
+    }
+  end
+
+  # Multiplies every stored value by scalar and returns the result as a new
+  # SparseVector; raises ArgumentError unless scalar is a Numeric.
+  def * scalar
+    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
+    each_pair.with_object(SparseVector.new) { |(key, value), scaled|
+      scaled[key] = value * scalar
+    }
+  end
 end
 
 def mean_sparse_vector array_of_vectors
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
new file mode 100644
index 0000000..42be45e
--- /dev/null
+++ b/lib/nlp_ruby/bleu.rb
@@ -0,0 +1,137 @@
+# Corpus-level BLEU computed from per-segment n-gram statistics.
+module BLEU
+
+
+# Accumulates, for each n-gram order 1..n, the number of hypothesis n-grams
+# (sum) and how many of them are matched by the reference (clipped), plus
+# the reference and hypothesis lengths.
+class BLEU::NgramCounts
+  attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
+
+  def initialize(n)
+    @n = 0
+    @sum = []
+    @clipped = []
+    @ref_len = 0.0
+    @hyp_len = 0.0
+    grow(n)
+  end
+
+  # Extends the per-order counters with zeros up to order n.
+  def grow(n)
+    return if n <= @n # never shrink: @n has to match the counter array sizes
+    (n-@n).times {
+      @sum << 0.0
+      @clipped << 0.0
+    }
+    @n = n
+  end
+
+  # Adds other's counters and lengths to this object, growing it first when
+  # other tracks higher orders.
+  def plus_eq(other)
+    grow(other.n)
+    other.n.times { |m|
+      @sum[m] += other.sum[m]
+      @clipped[m] += other.clipped[m]
+    }
+    @ref_len += other.ref_len
+    @hyp_len += other.hyp_len
+  end
+
+  def to_s
+    return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
+  end
+end
+
+# Counts n-grams; an n-gram is passed either as an Array of tokens or as the
+# equivalent space-joined String.
+class BLEU::Ngrams
+  def initialize
+    @h_ = Hash.new(0)
+  end
+
+  def add(k)
+    @h_[key_for(k)] += 1
+  end
+
+  def get_count(k)
+    @h_[key_for(k)]
+  end
+
+  # Yields every distinct n-gram as an Array of tokens, with its count.
+  def each
+    @h_.each_pair { |k,v|
+      yield k.split, v
+    }
+  end
+
+  def to_s
+    @h_.to_s
+  end
+
+  private
+
+  # Hash key for an n-gram: Arrays (and subclasses) are joined with spaces.
+  def key_for(k)
+    k.is_a?(Array) ? k.join(' ') : k
+  end
+end
+
+# Collects the n-gram statistics up to order n for one hypothesis/reference
+# pair of Strings; every count and length is multiplied by times. Uses the
+# ngrams and tokenize helpers, which are defined outside this file.
+def BLEU::get_counts hypothesis, reference, n, times=1
+  counts = NgramCounts.new n # not named p, which would shadow Kernel#p
+  r = Ngrams.new
+  ngrams(reference, n) { |ng| r.add ng }
+  h = Ngrams.new
+  ngrams(hypothesis, n) { |ng| h.add ng }
+  h.each { |ng,count|
+    sz = ng.size-1
+    counts.sum[sz] += count * times
+    counts.clipped[sz] += [r.get_count(ng), count].min * times
+  }
+  counts.ref_len = tokenize(reference.strip).size * times
+  counts.hyp_len = tokenize(hypothesis.strip).size * times
+  return counts
+end
+
+# Brevity penalty for hypothesis length c and reference length r: 1.0 if the
+# hypothesis is longer than the reference, exp(1 - r/c) otherwise.
+def BLEU::brevity_penalty(c, r)
+  return 1.0 if c > r
+  return 0.0 if c == 0 # empty hypothesis: no ZeroDivisionError (Integers) or NaN (0.0/0.0)
+  Math.exp(1 - r.to_f/c) # to_f: Integer lengths must not be truncated by r/c
+end
+
+# Corpus-level BLEU over counts (a collection of NgramCounts), using orders
+# 1..n with uniform weights; returns 0.0 as soon as an order has no match.
+# With debug set, per-order statistics and the penalty go to STDERR.
+def BLEU::bleu(counts, n, debug=false)
+  corpus_stats = NgramCounts.new n
+  counts.each { |i| corpus_stats.plus_eq i }
+  sum = 0.0
+  w = 1.0/n
+  0.upto(n-1) { |m|
+    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
+    # sum[m], not sum: comparing the whole Array with 0 is never true.
+    return 0.0 if corpus_stats.clipped[m] == 0 || corpus_stats.sum[m] == 0
+    sum += w * Math.log(corpus_stats.clipped[m] / corpus_stats.sum[m])
+  }
+  bp = brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)
+  if debug
+    STDERR.write "BP #{bp}\n"
+    STDERR.write "sum #{Math.exp(sum)}\n"
+  end
+  return bp * Math.exp(sum)
+end
+
+# bleu expressed as a percentage, rounded to three decimals.
+def BLEU::hbleu counts, n, debug=false
+  (100*bleu(counts, n, debug)).round(3)
+end
+
+
+end
+
diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb
index 825ceb4..e560aae 100644
--- a/lib/nlp_ruby/fileutil.rb
+++ b/lib/nlp_ruby/fileutil.rb
@@ -40,7 +40,7 @@ class WriteFile
 
   def initialize fn, encoding='utf-8'
     if fn.split('.').last == 'gz'
-      @f = Zlib::GzipWrite.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
+      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
     elsif fn == '-'
       @f = STDOUT
       STDOUT.set_encoding encoding
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
new file mode 100644
index 0000000..9a4064f
--- /dev/null
+++ b/lib/nlp_ruby/misc.rb
@@ -0,0 +1,6 @@
+# Core extension: adds Array#max_index.
+class Array
+  # Position of the first element equal to the array's maximum (nil for []).
+  def max_index; index(max); end
+end
+
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index e9a3bc9..4091994 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -3,8 +3,8 @@ def tokenize s
   s.strip.split
 end
 
-def splitpipe s
-  s.strip.split(/\s*\|\|\|\s*/)
+def splitpipe s, n=3 # splits s on a run of n '|' characters ("|||" by default)
+  s.strip.split("|"*n).map(&:strip) # strip each field, as the regex-based split did before
 end
 
 def downcase? s
@@ -32,3 +32,19 @@ def read_feature_string s
   return map
 end
 
+
+# Reads "key = value" lines from fn into a Hash (String => String); skips blank
+# lines, '#' comment lines and lines without '=', and drops trailing "# ..." text.
+def read_cfg fn
+  f = ReadFile.new fn
+  cfg = {}
+  while line = f.gets
+    content = line.strip.split('#', 2).first.to_s
+    k, v = content.split('=', 2).map(&:strip)
+    cfg[k] = v unless v.nil? # no '=' on this line: skip it instead of crashing on nil
+  end
+  cfg
+ensure
+  f.close if f # the handle was previously never released
+end
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
index 20b1412..598e318 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/ttable.rb
@@ -15,3 +15,62 @@ def read_phrase_table fn
   return table
 end
 
+# One k-best entry: id, raw and cleaned translation, feature map and score.
+class Translation
+  attr_accessor :id, :s, :raw, :f, :score
+
+  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil
+    @id, @raw, @s, @f, @score = id, raw, s, f, score
+  end
+
+  # Fills the object from a line of the form
+  # "id ||| translation ||| features ||| score". With strip_alignment, markers
+  # like "|0-1|" or "|3|" are removed from s (the way moses does it), while
+  # raw keeps them.
+  def from_s t, strip_alignment=true
+    num, text, feats, model_score = splitpipe(t, 3)
+    text.strip!
+    @raw = text
+    @s = text
+    if strip_alignment
+      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ').strip
+    end
+    @id = num.to_i
+    @f = read_feature_string feats
+    @score = model_score.to_f
+  end
+
+  # Joins id, s, the features (via to_kv) and score with ' ||| '.
+  def to_s
+    [id, s, f.to_kv, score].join ' ||| '
+  end
+end
+
+# Reads fn, one translation per line, and groups the lines into k-best lists:
+# a new list starts whenever the integer in the first '|||'-separated field
+# changes. Every line is parsed by translation_type#from_s, after which the
+# entry's id is set to its 0-based position within its list.
+# Returns an Array of Arrays; an empty file gives [].
+def read_kbest_lists fn, translation_type=Translation
+  kbest_lists = []
+  cur = []
+  prev = nil
+  f = ReadFile.new fn
+  while line = f.gets
+    t = translation_type.new
+    t.from_s line
+    c = splitpipe(line)[0].to_i
+    if c != prev
+      kbest_lists << cur unless cur.empty?
+      cur = []
+      prev = c
+    end
+    t.id = cur.size
+    cur << t
+  end
+  kbest_lists << cur unless cur.empty? # last one; no empty list for an empty file
+  kbest_lists
+ensure
+  f.close if f # also releases the handle when a line fails to parse
+end
+
diff --git a/nlp_ruby.gemspec b/nlp_ruby.gemspec
index 0737994..66716ad 100644
--- a/nlp_ruby.gemspec
+++ b/nlp_ruby.gemspec
@@ -1,9 +1,9 @@
 Gem::Specification.new do |s|
   s.name        = 'nlp_ruby'
-  s.version     = '0.1'
-  s.date        = '2014-01-29'
-  s.summary     = "nlp_ruby"
-  s.description = "NLP related tools and classes"
+  s.version     = '0.2'
+  s.date        = '2014-02-05'
+  s.summary     = 'nlp_ruby'
+  s.description = 'NLP related tools and classes'
   s.authors     = ["Patrick Simianer"]
   s.email       = 'p@simianer.de'
   s.files       = Dir['lib/*.rb', 'lib/nlp_ruby/*.rb']
author	Patrick Simianer <p@simianer.de>	2014-02-05 22:39:35 +0100
committer	Patrick Simianer <p@simianer.de>	2014-02-05 22:39:35 +0100
commit	3db876b9fbd93670e421f0ddb627ca7463330533 (patch)
tree	8e0b9b5abd09dc6d479fe76f21a97ab915e7ed8d
parent	4228c0af3c550a85d37b5565a806b8864a774c83 (diff)