=> 0.3; License and README updated; some from_* methods for SparseVector; ttable.rb => Translation.rb; moved some misc. stuff to misc.rb; monkey patched String

author: Patrick Simianer <p@simianer.de> 2014-02-14 17:14:49 +0100
committer: Patrick Simianer <p@simianer.de> 2014-02-14 17:14:49 +0100
commit: c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 (patch)
tree: f7030f39f20e21148ed817142eee0536e557c0c0
parent: 7255d33914122e58b031108de49918b8910eebc6 (diff)
12 files changed, 161 insertions, 191 deletions
diff --git a/LICENSE b/LICENSE
index 0d5dab3..cf4d89e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,23 @@
+The MIT License
+
 Copyright (C) 2014 Patrick Simianer <p ät simianer.de>
+http://simianer.de
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
 
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 26858ff..5db1487 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,13 @@ nlp_ruby
 My little NLP library, supposed to make _my_ work a little easier and less redundant.
 The .gem can be found here: https://rubygems.org/gems/nlp_ruby
 
-* dags.rb : implementation of a directed acyclic graph and various algorithms
-* fileutil.rb : file utilities
-* PriorityQueue.rb : a simple priority queue
-* semirings.rb : semirings for dags.rb
-* SparseVector.rb : sparse vectors for ruby, based on Hash
-* stringutil.rb : string utilities
-* tfidf.rb : functions to calculate tf/ntf/idf
-* ttable.rb : functions to read MT phrase tables
-* Vector.rb : vector class based on Array
+         bleu.rb : BLEU implementation, also per-sentence-BLEU
+          dag.rb : implementation of a directed acyclic graph and various algorithms
+     fileutil.rb : file utilities
+         misc.rb : misc. stuff (e.g. monkey patches for Array and String)
+    semirings.rb : semirings (used in dag.rb)
+ SparseVector.rb : sparse vectors for ruby, based on Hash class
+   stringutil.rb : string utilities
+        tfidf.rb : functions to calculate tf/ntf/idf
+  Translation.rb : an object for translations (e.g. entries of a decoder's k-best list)
 
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
index c7af97b..a3f9f1a 100755
--- a/lib/nlp_ruby.rb
+++ b/lib/nlp_ruby.rb
@@ -3,14 +3,12 @@
 require 'nlp_ruby/stringutil'
 require 'nlp_ruby/fileutil'
 require 'nlp_ruby/SparseVector'
-require 'nlp_ruby/PriorityQueue'
 require 'nlp_ruby/tfidf'
-require 'nlp_ruby/ttable'
-require 'nlp_ruby/dags'
+require 'nlp_ruby/Translation'
+require 'nlp_ruby/dag'
 require 'nlp_ruby/semirings'
 require 'nlp_ruby/bleu'
 require 'nlp_ruby/misc'
-require 'nlp_ruby/cdec'
 
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
deleted file mode 100644
index f090e60..0000000
--- a/lib/nlp_ruby/PriorityQueue.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# FIXME dags
-# this assumes that elements in the queue
-# have a numerical member named 'score'
-class PriorityQueue
-
-  def initialize a=Array.new
-    @queue = Array.new a
-    sort!
-  end
-
-  def sort!
-    @queue.sort_by! { |i| -i.score }
-  end
-
-  def pop
-    @queue.pop
-  end
-
-  def push i
-    @queue << i
-    sort!
-  end
-
-  def empty?
-    @queue.empty?
-  end
-
-  # FIXME
-  def to_s
-    a = []
-    @queue.each { |i|
-      a << "#{i.to_s}[#{i.score}]"
-    }
-    "[#{a.join ', '}]"
-  end
-end
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 1c0262b..b80373c 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -20,6 +20,34 @@ class SparseVector < Hash
     from_h eval(s)
   end
 
+  def to_kv sep='=', join=' '
+    a = []
+    self.each_pair { |k,v|
+      a << "#{k}#{sep}#{v}"
+    }
+    return a.join join
+  end
+
+  def from_kv s
+    s.split.each { |i|
+      k,v = i.split('=')
+      self[k] = v.to_f
+    }
+  end
+
+  def from_file fn, sep='='
+    f = ReadFile.new(fn)
+    while line = f.gets
+      key, value = line.strip.split sep
+      value = value.to_f
+      self[key] = value
+    end
+  end
+
+  def join_keys other
+    self.keys + other.keys
+  end
+
   def sum
     self.values.inject(:+)
   end
@@ -74,38 +102,6 @@ class SparseVector < Hash
     return Math.sqrt(sum)
   end
 
-  # FIXME
-  def from_kv_file fn, sep=' '
-    f = ReadFile.new(fn)
-    while line = f.gets
-      key, value = line.strip.split sep
-      value = value.to_f
-      self[key] = value
-    end
-  end
-  
-  # FIXME
-  def to_kv sep='='
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join ' '
-  end
-
-  # FIXME
-  def to_kv2 sep='='
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join "\n"
-  end
-
-  def join_keys other
-    self.keys + other.keys
-  end
-
   def + other
     new = SparseVector.new
     join_keys(other).each { |k|
@@ -132,9 +128,13 @@ class SparseVector < Hash
   end
 end
 
-def mean_sparse_vector array_of_vectors
+
+module SparseVector
+
+
+def SparseVector::mean a
   mean = SparseVector.new
-  array_of_vectors.each { |i|
+  a.each { |i|
     i.each_pair { |k,v|
       mean[k] += v
     }
@@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors
   return mean
 end
 
+
+end
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/Translation.rb
index c0f37be..0c346a4 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/Translation.rb
@@ -1,32 +1,13 @@
-# table['some French string'] = [Array of English strings]
-def read_phrase_table fn
-  table = {}
-  f = ReadFile.new fn
-  while raw_rule = f.gets
-    french, english, features = splitpipe(raw_rule)
-    feature_map = read_feature_string(features)
-    if table.has_key? french
-      table[french] << [english, feature_map ]
-    else
-      table[french] = [[english, feature_map]]
-    end
-  end
-  f.close
-  return table
-end
-
-# FIXME
 class Translation
   attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
 
-  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
+  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
     @id = id
     @raw = raw
     @s = s
     @f = f
-    @score = score
+    @scores = scores
     @rank = rank
-    @other_score = other_score
   end
 
   def from_s t, strip_alignment=true, rank=nil
@@ -41,17 +22,17 @@ class Translation
     end
     @id = id.to_i
     @f = read_feature_string features
-    @score = score.to_f
+    @scores['decoder'] = score.to_f
     @rank = rank
-    @other_score = nil
   end
 
-  def to_s
-    [@id, @s, @f.to_kv, @score].join ' ||| '
+  def to_s include_features=true
+    [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features
+    [@id, @s, @scores['decoder']].join(' ||| ') if !include_features
   end
 
   def to_s2
-    [@rank, @s, @score, @other_score].join ' ||| '
+    [@rank, @s, @score, @scores.to_s].join ' ||| '
   end
 end
 
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index ee91985..d7a6b2b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1
   return p
 end
 
-def BLEU::brevity_penalty c, r
+def BLEU::brevity_penalty c, r, hack=0.0
   return 1.0 if c>r
-  return Math.exp(1-r/c)
+  return Math.exp 1.0-((r+hack)/c)
 end
 
 def BLEU::bleu counts, n, debug=false
@@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false
   (100*bleu(counts, n, debug)).round(3)
 end
 
-def BLEU::per_sentence_bleu hypothesis, reference, n=4
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
   h_ng = {}; r_ng = {}
   (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
   ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
@@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4
   (1).upto(m) { |i|
     counts_clipped = 0
     counts_sum = h_ng[i].size
-    h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
+    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
     add = 1.0 if i >= 2
     sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
-  } 
+  }
   return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
 end
 
 
-end
+end # module
 
diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb
deleted file mode 100644
index 1080f14..0000000
--- a/lib/nlp_ruby/cdec.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-module CDEC
-
-require 'open3'
-
-
-# FIXME
-CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec"
-
-
-def CDEC::kbest input, ini, weights, k, unique=true
-  o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r  2>/dev/null"
-  j = -1
-  ret = []
-  o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t}
-  return ret
-end
-
-
-end
-
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dag.rb
index 7767be1..cca35c5 100644
--- a/lib/nlp_ruby/dags.rb
+++ b/lib/nlp_ruby/dag.rb
@@ -1,21 +1,8 @@
-###########################
-# TODO
-# output paths
-# visualization?
-# algorithms:
-#  beam search
-#  best-first
-#  kbest
-#  kruskal (MST)?
-#  transitive closure?
-###########################
+module DAG
 
 require 'json'
 
 
-module DAG
-
-
 class DAG::Node
   attr_accessor :label, :edges, :incoming, :score, :mark
 
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 80d932c..0f58100 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -21,6 +21,40 @@ class Array
   end
 end
 
+class String
+
+  def downcase? s
+    s[/[[:lower:]]/]
+  end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+  def initialize a=Array.new
+    @queue = Array.new a
+    sort!
+  end
+
+  def sort!
+    @queue.sort_by! { |i| -i.score }
+  end
+
+  def pop
+    @queue.pop
+  end
+
+  def push i
+    @queue << i
+    sort!
+  end
+
+  def empty?
+    @queue.empty?
+  end
+end
+
 def spawn_with_timeout cmd, t=4, debug=false
   require 'timeout'
   STDERR.write cmd+"\n" if debug
@@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false
   return pipe_in.read
 end
 
+def read_phrase_table fn
+  table = {}
+  f = ReadFile.new fn
+  while raw_rule = f.gets
+    french, english, features = splitpipe(raw_rule)
+    feature_map = read_feature_string(features)
+    if table.has_key? french
+      table[french] << [english, feature_map ]
+    else
+      table[french] = [[english, feature_map]]
+    end
+  end
+  f.close
+  return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+  require 'open3'
+  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+  cmd += " -r" if unique
+  o,_ = Open3.capture2 "#{cmd}  2>/dev/null"
+  a = []; j = -1
+  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+  return a
+end
+
+def read_config fn
+  f = ReadFile.new fn
+  cfg = {}
+  while line = f.gets
+    line.strip!
+    next if /^\s*$/.match line
+    next if line[0]=='#'
+    content = line.split('#', 2).first
+    k, v = content.split(/\s*=\s*/, 2)
+    k.strip!; v.strip!
+    cfg[k] = v
+  end
+  return cfg
+end
 
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
index a06f151..83551a9 100644
--- a/lib/nlp_ruby/semirings.rb
+++ b/lib/nlp_ruby/semirings.rb
@@ -1,4 +1,5 @@
-# semirings for graphs as described in
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
 # 'Dynamic Programming Algorithms in
 #  Semiring and Hypergraph Frameworks' (Liang Huang)
 class Semiring
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
 def tokenize s
   s.strip.split
 end
 
-def splitpipe s, n=3
-  s.strip.split("|"*n)
-end
-
-def downcase? s
-  s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
 def ngrams(s, n, fix=false)
   a = tokenize s
   a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
   }
 end
 
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
-  map = SparseVector.new
-  tokenize(s).each { |i|
-    key, value = i.split '='
-    map[key] = value.to_f
-  }
-  return map
-end
-
-
-def read_cfg fn
-  f = ReadFile.new fn
-  cfg = {}
-  while line = f.gets
-    line.strip!
-    next if /^\s*$/.match line
-    next if line[0]=='#'
-    content = line.split('#', 2).first
-    k, v = content.split(/\s*=\s*/, 2)
-    k.strip!; v.strip!
-    cfg[k] = v
-  end
-  return cfg
-end
-
 def bag_of_words s, stopwords=[]
   s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end 
+end
 
+def splitpipe s, n=3
+  s.strip.split("|"*n)
+end
author	Patrick Simianer <p@simianer.de>	2014-02-14 17:14:49 +0100
committer	Patrick Simianer <p@simianer.de>	2014-02-14 17:14:49 +0100
commit	c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 (patch)
tree	f7030f39f20e21148ed817142eee0536e557c0c0
parent	7255d33914122e58b031108de49918b8910eebc6 (diff)