summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de>, 2014-02-05 22:39:35 +0100
committer: Patrick Simianer <p@simianer.de>, 2014-02-05 22:39:35 +0100
commit: 3db876b9fbd93670e421f0ddb627ca7463330533 (patch)
tree: 8e0b9b5abd09dc6d479fe76f21a97ab915e7ed8d
parent: 4228c0af3c550a85d37b5565a806b8864a774c83 (diff)
bleu, more methods for SparseVector, misc => bump to 0.2
-rwxr-xr-x lib/nlp_ruby.rb 2
-rw-r--r-- lib/nlp_ruby/SparseVector.rb 43
-rw-r--r-- lib/nlp_ruby/bleu.rb 110
-rw-r--r-- lib/nlp_ruby/fileutil.rb 2
-rw-r--r-- lib/nlp_ruby/misc.rb 6
-rw-r--r-- lib/nlp_ruby/stringutil.rb 20
-rw-r--r-- lib/nlp_ruby/ttable.rb 59
-rw-r--r-- nlp_ruby.gemspec 8
8 files changed, 242 insertions, 8 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
index b80f893..212c367 100755
--- a/lib/nlp_ruby.rb
+++ b/lib/nlp_ruby.rb
@@ -9,6 +9,8 @@ require 'nlp_ruby/tfidf'
require 'nlp_ruby/ttable'
require 'nlp_ruby/dags'
require 'nlp_ruby/semirings'
+require 'nlp_ruby/bleu'
+require 'nlp_ruby/misc'
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 0033690..9919a65 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -5,10 +5,14 @@ class SparseVector < Hash
self.default = 0
end
- def from_hash h
+ def from_h h
h.each_pair { |k,v| self[k] = v }
end
+ # Populates this vector from the string s: the string is evaluated as Ruby
+ # code and the result is handed to from_h.
+ # NOTE(review): eval runs arbitrary code contained in s; only call this with
+ # trusted input, or replace it with a real parser.
+ def from_s s
+ from_h eval(s)
+ end
+
def sum
self.values.inject(:+)
end
@@ -48,6 +52,43 @@ class SparseVector < Hash
dims.each { |d| sum += (self[d] - other[d])**2 }
return Math.sqrt(sum)
end
+
+  # Serializes the vector as space-separated "key=value" pairs, in the
+  # hash's iteration order. An empty vector yields an empty string.
+  def to_kv
+    pairs = self.map { |key, value| "#{key}=#{value}" }
+    pairs.join(' ')
+  end
+
+  # Returns the union of this vector's keys and other's keys.
+  # Each key appears exactly once (own keys first), so callers that iterate
+  # the result do not process a shared dimension twice.
+  def join_keys other
+    self.keys | other.keys
+  end
+
+  # Element-wise sum: returns a new SparseVector holding self[k] + other[k]
+  # for every key returned by join_keys(other).
+  def + other
+    result = SparseVector.new
+    join_keys(other).each do |key|
+      result[key] = self[key] + other[key]
+    end
+    result
+  end
+
+  # Element-wise difference: returns a new SparseVector holding
+  # self[k] - other[k] for every key returned by join_keys(other).
+  def - other
+    result = SparseVector.new
+    join_keys(other).each do |key|
+      result[key] = self[key] - other[key]
+    end
+    result
+  end
+
+  # Scales every component by scalar and returns the result as a new
+  # SparseVector. Raises ArgumentError when scalar is not Numeric.
+  def * scalar
+    unless scalar.is_a? Numeric
+      raise ArgumentError, "Arg is not numeric #{scalar}"
+    end
+    scaled = SparseVector.new
+    self.each_pair { |key, value| scaled[key] = value * scalar }
+    scaled
+  end
end
def mean_sparse_vector array_of_vectors
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
new file mode 100644
index 0000000..42be45e
--- /dev/null
+++ b/lib/nlp_ruby/bleu.rb
@@ -0,0 +1,110 @@
+# BLEU score computation from accumulated n-gram match statistics.
+module BLEU
+
+  # Accumulator for per-order n-gram statistics.
+  # sum[m] and clipped[m] hold the total and clipped counts for order m+1;
+  # ref_len and hyp_len hold the accumulated reference/hypothesis lengths.
+  class NgramCounts
+    attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
+
+    # Creates zeroed statistics for orders 1..n.
+    def initialize(n)
+      @n = 0
+      @sum = []
+      @clipped = []
+      @ref_len = 0.0
+      @hyp_len = 0.0
+      grow(n)
+    end
+
+    # Extends the per-order arrays with zeros up to order n.
+    def grow(n)
+      (n-@n).times {
+        @sum << 0.0
+        @clipped << 0.0
+      }
+      @n = n
+    end
+
+    # Adds the statistics of other (another NgramCounts) to this object,
+    # growing first when other covers more orders.
+    def plus_eq(other)
+      grow(other.n) if other.n > @n
+      0.upto(other.n-1) { |m|
+        @sum[m] += other.sum[m]
+        @clipped[m] += other.clipped[m]
+      }
+      @ref_len += other.ref_len
+      @hyp_len += other.hyp_len
+    end
+
+    def to_s
+      "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
+    end
+  end
+
+  # Counter for n-grams. Array keys are joined with a single space, so an
+  # n-gram may be given either as an array of tokens or as a string.
+  class Ngrams
+    def initialize
+      @h_ = {}
+      @h_.default = 0
+    end
+
+    # Increments the count of n-gram k.
+    def add(k)
+      k = k.join ' ' if k.is_a? Array
+      @h_[k] += 1
+    end
+
+    # Returns the count of n-gram k (0 when unseen).
+    def get_count(k)
+      k = k.join ' ' if k.is_a? Array
+      @h_[k]
+    end
+
+    # Yields every stored n-gram as an array of tokens with its count.
+    def each
+      @h_.each_pair { |k,v|
+        yield k.split, v
+      }
+    end
+
+    def to_s
+      @h_.to_s
+    end
+  end
+
+  # Collects n-gram statistics for one hypothesis/reference pair, with every
+  # count and length multiplied by times.
+  # NOTE(review): relies on ngrams and tokenize, which are defined outside
+  # this module; presumably ngrams(s, n) yields every n-gram of order 1..n as
+  # an array of tokens -- confirm against its definition.
+  def self.get_counts(hypothesis, reference, n, times=1)
+    counts = NgramCounts.new n
+    ref_ngrams = Ngrams.new
+    ngrams(reference, n) { |ng| ref_ngrams.add ng }
+    hyp_ngrams = Ngrams.new
+    ngrams(hypothesis, n) { |ng| hyp_ngrams.add ng }
+    hyp_ngrams.each { |ng,count|
+      order = ng.size-1
+      counts.sum[order] += count * times
+      # clip the hypothesis count by the reference count
+      counts.clipped[order] += [ref_ngrams.get_count(ng), count].min * times
+    }
+    counts.ref_len = tokenize(reference.strip).size * times
+    counts.hyp_len = tokenize(hypothesis.strip).size * times
+    counts
+  end
+
+  # Brevity penalty for hypothesis length c and reference length r:
+  # 1.0 when c > r, otherwise exp(1 - r/c). An empty hypothesis (c == 0)
+  # yields 0.0 instead of NaN or a ZeroDivisionError.
+  def self.brevity_penalty(c, r)
+    return 1.0 if c > r
+    return 0.0 if c == 0
+    # force float division so Integer lengths are not truncated
+    Math.exp(1 - r.to_f/c)
+  end
+
+  # Corpus-level BLEU over counts (an enumerable of NgramCounts) using
+  # orders 1..n with uniform weights. Returns 0.0 as soon as one order has
+  # no clipped matches or no hypothesis n-grams. With debug set, the
+  # per-order statistics, the brevity penalty and the precision term are
+  # written to STDERR.
+  def self.bleu(counts, n, debug=false)
+    corpus_stats = NgramCounts.new n
+    counts.each { |c| corpus_stats.plus_eq c }
+    log_precision = 0.0
+    weight = 1.0/n
+    0.upto(n-1) { |m|
+      STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
+      # check the per-order total, not the whole array
+      return 0.0 if corpus_stats.clipped[m] == 0 || corpus_stats.sum[m] == 0
+      log_precision += weight * Math.log(corpus_stats.clipped[m] / corpus_stats.sum[m])
+    }
+    bp = brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)
+    if debug
+      STDERR.write "BP #{bp}\n"
+      STDERR.write "sum #{Math.exp(log_precision)}\n"
+    end
+    bp * Math.exp(log_precision)
+  end
+
+  # BLEU scaled to 0..100 and rounded to three decimals.
+  def self.hbleu counts, n, debug=false
+    (100*bleu(counts, n, debug)).round(3)
+  end
+
+end
+
diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb
index 825ceb4..e560aae 100644
--- a/lib/nlp_ruby/fileutil.rb
+++ b/lib/nlp_ruby/fileutil.rb
@@ -40,7 +40,7 @@ class WriteFile
def initialize fn, encoding='utf-8'
if fn.split('.').last == 'gz'
- @f = Zlib::GzipWrite.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
+ @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
elsif fn == '-'
@f = STDOUT
STDOUT.set_encoding encoding
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
new file mode 100644
index 0000000..9a4064f
--- /dev/null
+++ b/lib/nlp_ruby/misc.rb
@@ -0,0 +1,6 @@
+# Core extension: helper for locating the largest element of an Array.
+class Array
+  # Returns the index of the first occurrence of the maximum element,
+  # or nil when the array is empty.
+  def max_index
+    largest = max
+    index(largest)
+  end
+end
+
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index e9a3bc9..4091994 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -3,8 +3,8 @@ def tokenize s
s.strip.split
end
-def splitpipe s
- s.strip.split(/\s*\|\|\|\s*/)
+def splitpipe s, n=3
+ s.strip.split("|"*n)
end
def downcase? s
@@ -32,3 +32,19 @@ def read_feature_string s
return map
end
+
+# Reads a "key = value" configuration file named fn into a Hash of strings.
+# Blank lines and lines starting with '#' are skipped; everything after the
+# first '#' on a line is dropped as a comment. Keys and values are stripped.
+# Raises ArgumentError for a line without '='.
+def read_cfg fn
+  f = ReadFile.new fn
+  cfg = {}
+  begin
+    while line = f.gets
+      line.strip!
+      next if line.empty? || line[0] == '#'
+      content = line.split('#', 2).first
+      k, v = content.split(/\s*=\s*/, 2)
+      # without '=' there is no value part; fail with a clear message
+      raise ArgumentError, "Malformed config line (missing '='): #{line}" if v.nil?
+      cfg[k.strip] = v.strip
+    end
+  ensure
+    # release the file handle even when a line is rejected
+    f.close
+  end
+  cfg
+end
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
index 20b1412..598e318 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/ttable.rb
@@ -15,3 +15,62 @@ def read_phrase_table fn
return table
end
+# One translation entry: an id, the raw and cleaned surface strings,
+# a feature object and a score.
+class Translation
+  attr_accessor :id, :s, :raw, :f, :score
+
+  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil
+    @id, @raw, @s, @f, @score = id, raw, s, f, score
+  end
+
+  # Fills this object from t, a line of four "|||"-separated fields:
+  # id, raw translation, feature string, score. With strip_alignment set,
+  # "|i-j|" and "|i|" markers are removed from the raw string to form s;
+  # otherwise s is the raw string itself.
+  # NOTE(review): read_feature_string is defined elsewhere; to_s expects its
+  # result to respond to to_kv -- confirm.
+  def from_s t, strip_alignment=true
+    id_field, raw_field, feature_field, score_field = splitpipe(t, 3)
+    raw_field.strip!
+    @raw = raw_field
+    @s = if strip_alignment # the way moses does it
+      @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ').strip
+    else
+      raw_field
+    end
+    @id = id_field.to_i
+    @f = read_feature_string feature_field
+    @score = score_field.to_f
+  end
+
+  # Renders id, s, the features (via to_kv) and score, joined by ' ||| '.
+  def to_s
+    fields = [id, s, f.to_kv, score]
+    fields.join ' ||| '
+  end
+end
+
+# Reads k-best lists from the file fn. Each line is parsed into a new
+# translation_type object via from_s; consecutive lines sharing the same
+# leading "|||"-separated id form one list. Within a list, every entry's id
+# is overwritten with its 0-based position. Returns an array of lists; the
+# list being built when input ends is always appended.
+def read_kbest_lists fn, translation_type=Translation
+  lists = []
+  current = []
+  last_seen = -1
+  rank = 0
+  input = ReadFile.new fn
+  while line = input.gets
+    translation = translation_type.new
+    translation.from_s line
+    segment = splitpipe(line)[0].to_i
+    unless segment == last_seen
+      # a new segment id starts a new list
+      unless current.empty?
+        lists << current
+        current = []
+      end
+      last_seen = segment
+      rank = 0
+    end
+    translation.id = rank
+    current << translation
+    rank += 1
+  end
+  lists << current # last one
+  input.close
+  lists
+end
+
diff --git a/nlp_ruby.gemspec b/nlp_ruby.gemspec
index 0737994..66716ad 100644
--- a/nlp_ruby.gemspec
+++ b/nlp_ruby.gemspec
@@ -1,9 +1,9 @@
Gem::Specification.new do |s|
s.name = 'nlp_ruby'
- s.version = '0.1'
- s.date = '2014-01-29'
- s.summary = "nlp_ruby"
- s.description = "NLP related tools and classes"
+ s.version = '0.2'
+ s.date = '2014-02-05'
+ s.summary = 'nlp_ruby'
+ s.description = 'NLP related tools and classes'
s.authors = ["Patrick Simianer"]
s.email = 'p@simianer.de'
s.files = Dir['lib/*.rb', 'lib/nlp_ruby/*.rb']