summaryrefslogtreecommitdiff
path: root/lib/nlp_ruby
diff options
context:
space:
mode:
author Patrick Simianer <simianer@cl.uni-heidelberg.de> 2014-02-12 17:44:06 +0100
committer Patrick Simianer <simianer@cl.uni-heidelberg.de> 2014-02-12 17:44:06 +0100
commit 27bc315543a4e3002e5d4ec0e37be3dcc2e3114e (patch)
tree 8887ee8b5ce2bf6f6fc2c885aafef7340a265226 /lib/nlp_ruby
parent f69ba1155e5f51dce0669bcf3e79a4c230e120d2 (diff)
per sentence bleu, spawn with timeout
Diffstat (limited to 'lib/nlp_ruby')
-rw-r--r-- lib/nlp_ruby/SparseVector.rb 15
-rw-r--r-- lib/nlp_ruby/bleu.rb 25
-rw-r--r-- lib/nlp_ruby/misc.rb 26
-rw-r--r-- lib/nlp_ruby/stringutil.rb 5
-rw-r--r-- lib/nlp_ruby/ttable.rb 17
5 files changed, 79 insertions, 9 deletions
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 9919a65..964ef4e 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -53,10 +53,21 @@ class SparseVector < Hash
return Math.sqrt(sum)
end
- def to_kv
+ # FIXME
+ def from_kv_file fn, sep=' '
+ f = ReadFile.new(fn)
+ while line = f.gets
+ key, value = line.strip.split sep
+ value = value.to_f
+ self[key] = value
+ end
+ end
+
+ # FIXME
+ def to_kv sep='='
a = []
self.each_pair { |k,v|
- a << "#{k}=#{v}"
+ a << "#{k}#{sep}#{v}"
}
return a.join ' '
end
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index 42be45e..ee91985 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,12 +79,12 @@ def BLEU::get_counts hypothesis, reference, n, times=1
return p
end
# Brevity penalty for a candidate of length +c+ against a reference of
# length +r+.
#
# Returns 1.0 when the candidate is at least as long as the reference,
# otherwise exp(1 - r/c). The ratio is computed in floating point so that
# integer lengths (e.g. token counts from Array#size) are not truncated by
# integer division, and a zero-length candidate yields 0.0 instead of
# raising ZeroDivisionError.
def BLEU::brevity_penalty c, r
  return 1.0 if c >= r
  return Math.exp(1.0 - r.to_f/c)
end
-def BLEU::bleu(counts, n, debug=false)
+def BLEU::bleu counts, n, debug=false
corpus_stats = NgramCounts.new n
counts.each { |i| corpus_stats.plus_eq i }
sum = 0.0
@@ -105,6 +105,25 @@ def BLEU::hbleu counts, n, debug=false
(100*bleu(counts, n, debug)).round(3)
end
# Smoothed sentence-level BLEU of +hypothesis+ against a single +reference+.
#
# hypothesis, reference :: whitespace-tokenizable strings
# n                     :: maximum n-gram order (default 4)
#
# The effective order m is min(n, number of reference tokens) and every
# order gets the uniform weight 1/m. For each order the clipped precision is
#   sum over distinct hypothesis n-grams of min(hyp count, ref count)
#   divided by the number of hypothesis n-grams,
# with 1 added to numerator and denominator for orders >= 2.
# The result is brevity_penalty * exp(sum of weighted log precisions).
#
# Returns 0.0 when either side has no tokens (nothing can match, and the
# weights/brevity penalty would otherwise divide by zero).
def BLEU::per_sentence_bleu hypothesis, reference, n=4
  h_len = hypothesis.strip.split.size
  r_len = reference.strip.split.size
  return 0.0 if h_len == 0 || r_len == 0

  # n-gram -> occurrence count, grouped by n-gram order
  h_counts = {}; r_counts = {}
  1.upto(n) { |i| h_counts[i] = Hash.new(0); r_counts[i] = Hash.new(0) }
  ngrams(hypothesis, n) { |ng| h_counts[ng.size][ng] += 1 }
  ngrams(reference, n) { |ng| r_counts[ng.size][ng] += 1 }

  m = [n, r_len].min
  weight = 1.0/m
  sum = 0.0
  1.upto(m) { |i|
    counts_sum = h_counts[i].values.inject(0) { |acc, cnt| acc + cnt }
    counts_clipped = 0
    # clip each match by how often the n-gram occurs in the hypothesis, so a
    # precision can never exceed 1
    h_counts[i].each_pair { |ng, cnt| counts_clipped += [cnt, r_counts[i][ng]].min }
    add = i >= 2 ? 1.0 : 0.0
    sum += weight * Math.log((counts_clipped + add)/(counts_sum + add))
  }
  # lengths are passed as floats so the penalty ratio is not integer-truncated
  return brevity_penalty(h_len.to_f, r_len.to_f) * Math.exp(sum)
end
+
end
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 9a4064f..1fa3878 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -2,5 +2,31 @@ class Array
# Position of the first occurrence of the largest element
# (nil when the array is empty).
def max_index
  largest = self.max
  self.index(largest)
end
+
+ def is_subset_of? other
+ self.each { |i|
+ if other.include? i
+ return false
+ end
+ }
+ return true
+ end
+end
+
# Run +cmd+ via Process.spawn and return everything it wrote to stdout.
#
# cmd   :: command line to execute
# t     :: timeout in seconds (default 4)
# debug :: when true, the command line is echoed to STDERR first
#
# Returns the captured output, or "" when the command has not finished
# within +t+ seconds. Exceptions raised by Process.spawn itself propagate.
#
# The output is drained while the child is still running: waiting for the
# child first would block any command that writes more than the pipe buffer
# holds and turn it into a spurious timeout. Both pipe ends are always closed.
def spawn_with_timeout cmd, t=4, debug=false
  require 'timeout'
  STDERR.write cmd+"\n" if debug
  pipe_in, pipe_out = IO.pipe
  begin
    pid = Process.spawn(cmd, :out => pipe_out)
    # the parent must drop its copy of the write end or read never sees EOF
    pipe_out.close
    begin
      return Timeout.timeout(t) {
        output = pipe_in.read
        Process.wait pid
        output
      }
    rescue Timeout::Error
      # the child is deliberately not killed; detach so it is reaped once it
      # exits instead of lingering as a zombie
      Process.detach pid
      return ""
    end
  ensure
    pipe_out.close unless pipe_out.closed?
    pipe_in.close
  end
end
+
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index 4091994..d7381bb 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -48,3 +48,8 @@ def read_cfg fn
return cfg
end
# Sorted list of the distinct whitespace-separated tokens of +s+, with every
# token that appears in +stopwords+ removed.
def bag_of_words s, stopwords=[]
  tokens = s.strip.split
  vocabulary = tokens.uniq.sort
  vocabulary.reject { |word| stopwords.include? word }
end
+
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
index 598e318..14d6c5d 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/ttable.rb
@@ -15,18 +15,21 @@ def read_phrase_table fn
return table
end
+# FIXME
class Translation
- attr_accessor :id, :s, :raw, :f, :score
+ attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
- def initialize id=nil, raw=nil, s=nil, f=nil, score=nil
+ def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
@id = id
@raw = raw
@s = s
@f = f
@score = score
+ @rank = rank
+ @other_score = other_score
end
- def from_s t, strip_alignment=true
+ def from_s t, strip_alignment=true, rank=nil
id, raw, features, score = splitpipe(t, 3)
raw.strip!
@raw = raw
@@ -39,10 +42,16 @@ class Translation
@id = id.to_i
@f = read_feature_string features
@score = score.to_f
+ @rank = rank
+ @other_score = nil
end
def to_s
- [id, s, f.to_kv, score].join ' ||| '
+ [@id, @s, @f.to_kv, @score].join ' ||| '
+ end
+
+ def to_s2
+ [@rank, @s, @f.to_kv, @score, @other_score].join ' ||| '
end
end