per sentence bleu, spawn with timeout

author: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2014-02-12 17:44:06 +0100
committer: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2014-02-12 17:44:06 +0100
commit: 27bc315543a4e3002e5d4ec0e37be3dcc2e3114e (patch)
tree: 8887ee8b5ce2bf6f6fc2c885aafef7340a265226 /lib/nlp_ruby
parent: f69ba1155e5f51dce0669bcf3e79a4c230e120d2 (diff)
5 files changed, 79 insertions, 9 deletions
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 9919a65..964ef4e 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -53,10 +53,21 @@ class SparseVector < Hash
     return Math.sqrt(sum)
   end
 
-  def to_kv
+  # FIXME (author's marker, reason not stated): loads "key<sep>value" lines from file fn into this vector
+  def from_kv_file fn, sep=' '
+    f = ReadFile.new(fn) # ReadFile is declared outside this patch; the handle is never closed in this method
+    while line = f.gets # the loop stops as soon as f.gets returns nil/false
+      key, value = line.strip.split sep # only the first two fields are used, extra fields are dropped
+      value = value.to_f # to_f gives 0.0 for non-numeric text and for a missing (nil) value
+      self[key] = value # a key that occurs again overwrites the earlier entry
+    end
+  end
+  
+  # FIXME (author's marker, reason not stated): renders the vector as "key<sep>value" pairs joined by single spaces
+  def to_kv sep='='
     a = []
     self.each_pair { |k,v|
-      a << "#{k}=#{v}"
+      a << "#{k}#{sep}#{v}" # sep takes the place of the previously hard-coded '='
     }
     return a.join ' '
   end
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index 42be45e..ee91985 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,12 +79,12 @@ def BLEU::get_counts hypothesis, reference, n, times=1
   return p
 end
 
-def BLEU::brevity_penalty(c, r)
-  if c > r then return 1.0 end
+def BLEU::brevity_penalty c, r # returns 1.0 when c>r, otherwise exp(1-r/c)
+  return 1.0 if c>r # NOTE(review): r/c below truncates when both are Integers (and raises for c==0) - pass Floats
   return Math.exp(1-r/c)
 end
 
-def BLEU::bleu(counts, n, debug=false)
+def BLEU::bleu counts, n, debug=false
   corpus_stats = NgramCounts.new n
   counts.each { |i| corpus_stats.plus_eq i }
   sum = 0.0
@@ -105,6 +105,25 @@ def BLEU::hbleu counts, n, debug=false
   (100*bleu(counts, n, debug)).round(3)
 end
 
+def BLEU::per_sentence_bleu hypothesis, reference, n=4 # smoothed sentence-level BLEU of two strings, n-gram orders 1..n
+  h_ng = {}; r_ng = {} # n-gram order => list of n-grams, for hypothesis and reference
+  (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
+  ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
+  ngrams(reference, n) {|i| r_ng[i.size] << i}
+  m = [n, reference.split.size].min # highest order that is scored, capped by the reference length
+  return 0.0 if m < 1 || h_ng[1].empty? # nothing to score; also avoids 1.0/0 and log(0.0/0)
+  weight = 1.0/m
+  sum = 0.0
+  (1).upto(m) { |i|
+    counts_sum = h_ng[i].size
+    # clip every distinct hypothesis n-gram at its reference count, so precision can never exceed 1
+    counts_clipped = h_ng[i].uniq.inject(0) {|acc, j| acc + [h_ng[i].count(j), r_ng[i].count(j)].min}
+    add = i >= 2 ? 1.0 : 0.0 # add-one smoothing for orders >= 2 only, unigrams stay unsmoothed
+    sum += weight * Math.log((counts_clipped + add)/(counts_sum + add))
+  }
+  return brevity_penalty(hypothesis.strip.split.size.to_f, reference.strip.split.size.to_f) * Math.exp(sum) # Floats: r/c must not truncate
+end
+
 
 end
 
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 9a4064f..1fa3878 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -2,5 +2,31 @@ class Array
   def max_index
     self.index(self.max)
   end
+
+  # Returns true iff every element of self also occurs in other.
+  # Only membership is tested, so duplicates in self are not counted;
+  # an empty array is a subset of anything.
+  #
+  # other:: any object responding to include? (Array, Set, ...)
+  def is_subset_of? other
+    self.all? { |i| other.include? i }
+  end
+end
+
+def spawn_with_timeout cmd, t=4, debug=false # runs cmd in a child process; returns its stdout, or "" after t seconds
+  require 'timeout'
+  STDERR.write cmd+"\n" if debug
+  pipe_in, pipe_out = IO.pipe
+  pid = Process.spawn(cmd, :out => pipe_out)
+  pipe_out.close # only the child may hold the write end, otherwise the read below never sees EOF
+  begin
+    # read before waiting: a child that fills the pipe buffer blocks until somebody drains it
+    return Timeout.timeout(t) { out = pipe_in.read; Process.wait pid; out }
+  rescue Timeout::Error
+    return "" # accept the zombies: on timeout the child is neither killed nor reaped
+  ensure
+    pipe_in.close # release the descriptor on the timeout path as well
+  end
 end
 
+
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index 4091994..d7381bb 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -48,3 +48,8 @@ def read_cfg fn
   return cfg
 end
 
+def bag_of_words s, stopwords=[] # sorted, de-duplicated whitespace tokens of s with the stopwords removed
+  s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } # stopwords are matched by exact string equality
+end 
+
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
index 598e318..14d6c5d 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/ttable.rb
@@ -15,18 +15,21 @@ def read_phrase_table fn
   return table
 end
 
+# FIXME
 class Translation
-  attr_accessor :id, :s, :raw, :f, :score
+  attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
 
-  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil
+  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil # every field is optional and defaults to nil
     @id = id
     @raw = raw
     @s = s
     @f = f
     @score = score
+    @rank = rank # printed as the first field by to_s2
+    @other_score = other_score # printed as the last field by to_s2
   end
 
-  def from_s t, strip_alignment=true
+  def from_s t, strip_alignment=true, rank=nil
     id, raw, features, score = splitpipe(t, 3)
     raw.strip!
     @raw = raw
@@ -39,10 +42,16 @@ class Translation
     @id = id.to_i
     @f = read_feature_string features
     @score = score.to_f
+    @rank = rank
+    @other_score = nil
   end
 
   def to_s
-    [id, s, f.to_kv, score].join ' ||| '
+    [@id, @s, @f.to_kv, @score].join ' ||| ' # "id ||| s ||| features ||| score"; @f must respond to to_kv
+  end
+
+  def to_s2 # like to_s, but starts with @rank instead of @id and appends @other_score
+    [@rank, @s, @f.to_kv, @score, @other_score].join ' ||| ' # nil fields are joined as empty strings
   end
 end
author	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2014-02-12 17:44:06 +0100
committer	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2014-02-12 17:44:06 +0100
commit	27bc315543a4e3002e5d4ec0e37be3dcc2e3114e (patch)
tree	8887ee8b5ce2bf6f6fc2c885aafef7340a265226 /lib/nlp_ruby
parent	f69ba1155e5f51dce0669bcf3e79a4c230e120d2 (diff)