From c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Fri, 14 Feb 2014 17:14:49 +0100
Subject: => 0.3; License and README updated; some from_* methods for
 SparseVector; ttable.rb => Translation.rb; moved some misc. stuff to misc.rb;
 monkey patched String

---
 lib/nlp_ruby/PriorityQueue.rb |  37 -------
 lib/nlp_ruby/SparseVector.rb  |  71 +++++++-------
 lib/nlp_ruby/Translation.rb   |  66 +++++++++++++
 lib/nlp_ruby/bleu.rb          |  12 +--
 lib/nlp_ruby/cdec.rb          |  20 ----
 lib/nlp_ruby/dag.rb           | 205 +++++++++++++++++++++++++++++++++++++++
 lib/nlp_ruby/dags.rb          | 218 ------------------------------------------
 lib/nlp_ruby/misc.rb          |  74 ++++++++++++++
 lib/nlp_ruby/semirings.rb     |   3 +-
 lib/nlp_ruby/stringutil.rb    |  41 +-------
 lib/nlp_ruby/ttable.rb        |  85 ----------------
 11 files changed, 394 insertions(+), 438 deletions(-)
 delete mode 100644 lib/nlp_ruby/PriorityQueue.rb
 create mode 100644 lib/nlp_ruby/Translation.rb
 delete mode 100644 lib/nlp_ruby/cdec.rb
 create mode 100644 lib/nlp_ruby/dag.rb
 delete mode 100644 lib/nlp_ruby/dags.rb
 delete mode 100644 lib/nlp_ruby/ttable.rb

(limited to 'lib/nlp_ruby')

diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
deleted file mode 100644
index f090e60..0000000
--- a/lib/nlp_ruby/PriorityQueue.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# FIXME dags
-# this assumes that elements in the queue
-# have a numerical member named 'score'
-class PriorityQueue
-
-  def initialize a=Array.new
-    @queue = Array.new a
-    sort!
-  end
-
-  def sort!
-    @queue.sort_by! { |i| -i.score }
-  end
-
-  def pop
-    @queue.pop
-  end
-
-  def push i
-    @queue << i
-    sort!
-  end
-
-  def empty?
-    @queue.empty?
-  end
-
-  # FIXME
-  def to_s
-    a = []
-    @queue.each { |i|
-      a << "#{i.to_s}[#{i.score}]"
-    }
-    "[#{a.join ', '}]"
-  end
-end
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 1c0262b..b80373c 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -20,6 +20,34 @@ class SparseVector < Hash
     from_h eval(s)
   end
 
+  def to_kv sep='=', join=' '
+    a = []
+    self.each_pair { |k,v|
+      a << "#{k}#{sep}#{v}"
+    }
+    return a.join join
+  end
+
+  def from_kv s
+    s.split.each { |i|
+      k,v = i.split('=')
+      self[k] = v.to_f
+    }
+  end
+
+  def from_file fn, sep='='
+    f = ReadFile.new(fn)
+    while line = f.gets
+      key, value = line.strip.split sep
+      value = value.to_f
+      self[key] = value
+    end
+  end
+
+  def join_keys other
+    self.keys + other.keys
+  end
+
   def sum
     self.values.inject(:+)
   end
@@ -74,38 +102,6 @@ class SparseVector < Hash
     return Math.sqrt(sum)
   end
 
-  # FIXME
-  def from_kv_file fn, sep=' '
-    f = ReadFile.new(fn)
-    while line = f.gets
-      key, value = line.strip.split sep
-      value = value.to_f
-      self[key] = value
-    end
-  end
-  
-  # FIXME
-  def to_kv sep='='
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join ' '
-  end
-
-  # FIXME
-  def to_kv2 sep='='
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join "\n"
-  end
-
-  def join_keys other
-    self.keys + other.keys
-  end
-
   def + other
     new = SparseVector.new
     join_keys(other).each { |k|
@@ -132,9 +128,13 @@ class SparseVector < Hash
   end
 end
 
-def mean_sparse_vector array_of_vectors
+
+module SparseVector
+
+
+def SparseVector::mean a
   mean = SparseVector.new
-  array_of_vectors.each { |i|
+  a.each { |i|
     i.each_pair { |k,v|
       mean[k] += v
     }
@@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors
   return mean
 end
 
+
+end
+
diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb
new file mode 100644
index 0000000..0c346a4
--- /dev/null
+++ b/lib/nlp_ruby/Translation.rb
@@ -0,0 +1,66 @@
+class Translation
+  attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
+
+  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
+    @id = id
+    @raw = raw
+    @s = s
+    @f = f
+    @scores = scores
+    @rank = rank
+  end
+
+  def from_s t, strip_alignment=true, rank=nil
+    id, raw, features, score = splitpipe(t, 3)
+    raw.strip!
+    @raw = raw
+    if strip_alignment # the way moses does it
+      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
+      @s.strip!
+    else
+      @s = raw
+    end
+    @id = id.to_i
+    @f = read_feature_string features
+    @scores['decoder'] = score.to_f
+    @rank = rank
+  end
+
+  def to_s include_features=true
+    [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features
+    [@id, @s, @scores['decoder']].join(' ||| ') if !include_features
+  end
+
+  def to_s2
+    [@rank, @s, @score, @scores.to_s].join ' ||| '
+  end
+end
+
+def read_kbest_lists fn, translation_type=Translation
+  kbest_lists = []
+  cur = []
+  f = ReadFile.new fn
+  prev = -1
+  c = 0
+  id = 0
+  while line = f.gets
+    t = translation_type.new
+    t.from_s line
+    c = splitpipe(line)[0].to_i
+    if c != prev
+      if cur.size > 0
+        kbest_lists << cur
+        cur = []
+      end
+      prev = c
+      id = 0
+    end
+    t.id = id
+    cur << t
+    id += 1
+  end
+  kbest_lists << cur # last one
+  f.close
+  return kbest_lists
+end
+
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index ee91985..d7a6b2b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1
   return p
 end
 
-def BLEU::brevity_penalty c, r
+def BLEU::brevity_penalty c, r, hack=0.0
   return 1.0 if c>r
-  return Math.exp(1-r/c)
+  return Math.exp 1.0-((r+hack)/c)
 end
 
 def BLEU::bleu counts, n, debug=false
@@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false
   (100*bleu(counts, n, debug)).round(3)
 end
 
-def BLEU::per_sentence_bleu hypothesis, reference, n=4
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
   h_ng = {}; r_ng = {}
   (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
   ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
@@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4
   (1).upto(m) { |i|
     counts_clipped = 0
     counts_sum = h_ng[i].size
-    h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
+    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
     add = 1.0 if i >= 2
     sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
-  } 
+  }
   return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
 end
 
 
-end
+end # module
 
diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb
deleted file mode 100644
index 1080f14..0000000
--- a/lib/nlp_ruby/cdec.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-module CDEC
-
-require 'open3'
-
-
-# FIXME
-CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec"
-
-
-def CDEC::kbest input, ini, weights, k, unique=true
-  o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r  2>/dev/null"
-  j = -1
-  ret = []
-  o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t}
-  return ret
-end
-
-
-end
-
diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb
new file mode 100644
index 0000000..cca35c5
--- /dev/null
+++ b/lib/nlp_ruby/dag.rb
@@ -0,0 +1,205 @@
+module DAG
+
+require 'json'
+
+
+class DAG::Node
+  attr_accessor :label, :edges, :incoming, :score, :mark
+
+  def initialize label=nil, edges=[], incoming=[], score=nil
+    @label    = label
+    @edges    = edges # outgoing
+    @incoming = incoming
+    @score    = nil
+  end
+
+  def add_edge head, weight=0
+    exit if self==head # no self-cycles!
+    @edges << DAG::Edge.new(self, head, weight)
+    return @edges.last
+  end
+
+  def to_s
+    "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>"
+  end
+
+  def repr
+    "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
+  end
+end
+
+class DAG::Edge
+  attr_accessor :tail, :head, :weight, :mark
+
+  def initialize tail=nil, head=nil, weight=0
+    @tail   = tail
+    @head   = head
+    @weight = weight
+    @mark   = false # did we already follow this edge? -- for topological sorting
+  end
+
+  def to_s
+    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
+    s += " x" if @mark
+    s += ">"
+    s
+  end
+end
+
+# depth-first search
+#  w/o markings as we do not have cycles
+def DAG::dfs n, target_label
+  return n if n.label==target_label # assumes uniq labels!
+  stack = n.edges.map { |i| i.head }
+  while !stack.empty?
+    m = stack.pop
+    return DAG::dfs m, target_label
+  end
+  return nil
+end
+
+# breadth-first search
+#  w/o markings as we do not have cycles
+def DAG::bfs n, target_label
+  queue = [n]
+  while !queue.empty?
+    m = queue.shift
+    return m if m.label==target_label
+    m.edges.each { |e| queue << e.head }
+  end
+  return nil
+end
+
+# topological sort
+def DAG::topological_sort graph
+  sorted = []
+  s = graph.reject { |n| !n.incoming.empty? }
+  while !s.empty?
+    sorted << s.shift
+    sorted.last.edges.each { |e|
+      e.mark = true
+      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
+    }
+  end
+  return sorted
+end
+
+# initialize graph scores with semiring One
+def DAG::init graph, semiring, source_node
+  graph.each {|n| n.score=semiring.null}
+  source_node.score = semiring.one
+end
+
+# viterbi
+def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
+  toposorted = DAG::topological_sort(graph)
+  DAG::init(graph, semiring, source_node)
+  toposorted.each { |n|
+    n.incoming.each { |e|
+      # update
+      n.score = \
+        semiring.add.call(n.score, \
+                          semiring.multiply.call(e.tail.score, e.weight)
+        )
+    }
+  }
+end
+
+# forward viterbi
+def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
+  toposorted = DAG::topological_sort(graph)
+  DAG::init(graph, semiring, source_node)
+  toposorted.each { |n|
+    n.edges.each { |e|
+      e.head.score = \
+        semiring.add.call(e.head.score, \
+                          semiring.multiply.call(n.score, e.weight)
+        )
+    }
+  }
+end
+
+# Dijkstra algorithm
+#  for A*-search we would need an optimistic estimate of
+#  future cost at each node
+def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
+  DAG::init(graph, semiring, source_node)
+  q = PriorityQueue.new graph
+  while !q.empty?
+    n = q.pop
+    n.edges.each { |e|
+      e.head.score = \
+        semiring.add.call(e.head.score, \
+                          semiring.multiply.call(n.score, e.weight))
+      q.sort!
+    }
+  end
+end
+
+# Bellman-Ford algorithm
+def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
+  DAG::init(graph, semiring, source_node)
+  edges = []
+  graph.each { |n| edges |= n.edges }
+  # relax edges
+  (graph.size-1).times{ |i|
+    edges.each { |e|
+      e.head.score = \
+        semiring.add.call(e.head.score, \
+                          semiring.multiply.call(e.tail.score, e.weight))
+    }
+  }
+  # we do not allow cycles (negative or positive)
+end
+
+# Floyd algorithm
+def DAG::floyd(graph, semiring=nil)
+  dist_matrix = []
+  graph.each_index { |i|
+    dist_matrix << []
+    graph.each_index { |j|
+      val = 1.0/0.0
+      val = 0.0 if i==j
+      dist_matrix.last << val
+    }
+  }
+  edges = []
+  graph.each { |n| edges |= n.edges }
+  edges.each { |e|
+    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
+  }
+  0.upto(graph.size-1) { |k|
+    0.upto(graph.size-1) { |i|
+      0.upto(graph.size-1) { |j|
+        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
+          dist_matrix  [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
+        end
+      }
+    }
+  }
+  return dist_matrix
+end
+
+
+# returns a list of nodes (graph) and a hash for finding
+# nodes by their label (these need to be unique!)
+def DAG::read_graph_from_json fn, semiring=RealSemiring.new
+  graph = []
+  nodes_by_label = {}
+  h = JSON.parse File.new(fn).read
+  h['nodes'].each { |i|
+    n = DAG::Node.new i['label']
+    graph << n
+    nodes_by_label[n.label] = n
+  }
+  h['edges'].each { |i|
+    n = nodes_by_label[i['tail']]
+    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
+    nodes_by_label[i['head']].incoming << a
+  }
+  return graph, nodes_by_label
+end
+
+
+end # module
+
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dags.rb
deleted file mode 100644
index 7767be1..0000000
--- a/lib/nlp_ruby/dags.rb
+++ /dev/null
@@ -1,218 +0,0 @@
-###########################
-# TODO
-# output paths
-# visualization?
-# algorithms:
-#  beam search
-#  best-first
-#  kbest
-#  kruskal (MST)?
-#  transitive closure?
-###########################
-
-require 'json'
-
-
-module DAG
-
-
-class DAG::Node
-  attr_accessor :label, :edges, :incoming, :score, :mark
-
-  def initialize label=nil, edges=[], incoming=[], score=nil
-    @label    = label
-    @edges    = edges # outgoing
-    @incoming = incoming
-    @score    = nil
-  end
-
-  def add_edge head, weight=0
-    exit if self==head # no self-cycles!
-    @edges << DAG::Edge.new(self, head, weight)
-    return @edges.last
-  end
-
-  def to_s
-    "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>"
-  end
-
-  def repr
-    "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
-  end
-end
-
-class DAG::Edge
-  attr_accessor :tail, :head, :weight, :mark
-
-  def initialize tail=nil, head=nil, weight=0
-    @tail   = tail
-    @head   = head
-    @weight = weight
-    @mark   = false # did we already follow this edge? -- for topological sorting
-  end
-
-  def to_s
-    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
-    s += " x" if @mark
-    s += ">"
-    s
-  end
-end
-
-# depth-first search
-#  w/o markings as we do not have cycles
-def DAG::dfs n, target_label
-  return n if n.label==target_label # assumes uniq labels!
-  stack = n.edges.map { |i| i.head }
-  while !stack.empty?
-    m = stack.pop
-    return DAG::dfs m, target_label
-  end
-  return nil
-end
-
-# breadth-first search
-#  w/o markings as we do not have cycles
-def DAG::bfs n, target_label
-  queue = [n]
-  while !queue.empty?
-    m = queue.shift
-    return m if m.label==target_label
-    m.edges.each { |e| queue << e.head }
-  end
-  return nil
-end
-
-# topological sort
-def DAG::topological_sort graph
-  sorted = []
-  s = graph.reject { |n| !n.incoming.empty? }
-  while !s.empty?
-    sorted << s.shift
-    sorted.last.edges.each { |e|
-      e.mark = true
-      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
-    }
-  end
-  return sorted
-end
-
-# initialize graph scores with semiring One
-def DAG::init graph, semiring, source_node
-  graph.each {|n| n.score=semiring.null}
-  source_node.score = semiring.one
-end
-
-# viterbi
-def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
-  toposorted = DAG::topological_sort(graph)
-  DAG::init(graph, semiring, source_node)
-  toposorted.each { |n|
-    n.incoming.each { |e|
-      # update
-      n.score = \
-        semiring.add.call(n.score, \
-                          semiring.multiply.call(e.tail.score, e.weight)
-        )
-    }
-  }
-end
-
-# forward viterbi
-def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
-  toposorted = DAG::topological_sort(graph)
-  DAG::init(graph, semiring, source_node)
-  toposorted.each { |n|
-    n.edges.each { |e|
-      e.head.score = \
-        semiring.add.call(e.head.score, \
-                          semiring.multiply.call(n.score, e.weight)
-        )
-    }
-  }
-end
-
-# Dijkstra algorithm
-#  for A*-search we would need an optimistic estimate of
-#  future cost at each node
-def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
-  DAG::init(graph, semiring, source_node)
-  q = PriorityQueue.new graph
-  while !q.empty?
-    n = q.pop
-    n.edges.each { |e|
-      e.head.score = \
-        semiring.add.call(e.head.score, \
-                          semiring.multiply.call(n.score, e.weight))
-      q.sort!
-    }
-  end
-end
-
-# Bellman-Ford algorithm
-def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
-  DAG::init(graph, semiring, source_node)
-  edges = []
-  graph.each { |n| edges |= n.edges }
-  # relax edges
-  (graph.size-1).times{ |i|
-    edges.each { |e|
-      e.head.score = \
-        semiring.add.call(e.head.score, \
-                          semiring.multiply.call(e.tail.score, e.weight))
-    }
-  }
-  # we do not allow cycles (negative or positive)
-end
-
-# Floyd algorithm
-def DAG::floyd(graph, semiring=nil)
-  dist_matrix = []
-  graph.each_index { |i|
-    dist_matrix << []
-    graph.each_index { |j|
-      val = 1.0/0.0
-      val = 0.0 if i==j
-      dist_matrix.last << val
-    }
-  }
-  edges = []
-  graph.each { |n| edges |= n.edges }
-  edges.each { |e|
-    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
-  }
-  0.upto(graph.size-1) { |k|
-    0.upto(graph.size-1) { |i|
-      0.upto(graph.size-1) { |j|
-        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
-          dist_matrix  [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
-        end
-      }
-    }
-  }
-  return dist_matrix
-end
-
-
-# returns a list of nodes (graph) and a hash for finding
-# nodes by their label (these need to be unique!)
-def DAG::read_graph_from_json fn, semiring=RealSemiring.new
-  graph = []
-  nodes_by_label = {}
-  h = JSON.parse File.new(fn).read
-  h['nodes'].each { |i|
-    n = DAG::Node.new i['label']
-    graph << n
-    nodes_by_label[n.label] = n
-  }
-  h['edges'].each { |i|
-    n = nodes_by_label[i['tail']]
-    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
-    nodes_by_label[i['head']].incoming << a
-  }
-  return graph, nodes_by_label
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 80d932c..0f58100 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -21,6 +21,40 @@ class Array
   end
 end
 
+class String
+
+  def downcase? s
+    s[/[[:lower:]]/]
+  end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+  def initialize a=Array.new
+    @queue = Array.new a
+    sort!
+  end
+
+  def sort!
+    @queue.sort_by! { |i| -i.score }
+  end
+
+  def pop
+    @queue.pop
+  end
+
+  def push i
+    @queue << i
+    sort!
+  end
+
+  def empty?
+    @queue.empty?
+  end
+end
+
 def spawn_with_timeout cmd, t=4, debug=false
   require 'timeout'
   STDERR.write cmd+"\n" if debug
@@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false
   return pipe_in.read
 end
 
+def read_phrase_table fn
+  table = {}
+  f = ReadFile.new fn
+  while raw_rule = f.gets
+    french, english, features = splitpipe(raw_rule)
+    feature_map = read_feature_string(features)
+    if table.has_key? french
+      table[french] << [english, feature_map ]
+    else
+      table[french] = [[english, feature_map]]
+    end
+  end
+  f.close
+  return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+  require 'open3'
+  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+  cmd += " -r" if unique
+  o,_ = Open3.capture2 "#{cmd}  2>/dev/null"
+  a = []; j = -1
+  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+  return a
+end
+
+def read_config fn
+  f = ReadFile.new fn
+  cfg = {}
+  while line = f.gets
+    line.strip!
+    next if /^\s*$/.match line
+    next if line[0]=='#'
+    content = line.split('#', 2).first
+    k, v = content.split(/\s*=\s*/, 2)
+    k.strip!; v.strip!
+    cfg[k] = v
+  end
+  return cfg
+end
 
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
index a06f151..83551a9 100644
--- a/lib/nlp_ruby/semirings.rb
+++ b/lib/nlp_ruby/semirings.rb
@@ -1,4 +1,5 @@
-# semirings for graphs as described in
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
 # 'Dynamic Programming Algorithms in
 #  Semiring and Hypergraph Frameworks' (Liang Huang)
 class Semiring
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
 def tokenize s
   s.strip.split
 end
 
-def splitpipe s, n=3
-  s.strip.split("|"*n)
-end
-
-def downcase? s
-  s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
 def ngrams(s, n, fix=false)
   a = tokenize s
   a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
   }
 end
 
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
-  map = SparseVector.new
-  tokenize(s).each { |i|
-    key, value = i.split '='
-    map[key] = value.to_f
-  }
-  return map
-end
-
-
-def read_cfg fn
-  f = ReadFile.new fn
-  cfg = {}
-  while line = f.gets
-    line.strip!
-    next if /^\s*$/.match line
-    next if line[0]=='#'
-    content = line.split('#', 2).first
-    k, v = content.split(/\s*=\s*/, 2)
-    k.strip!; v.strip!
-    cfg[k] = v
-  end
-  return cfg
-end
-
 def bag_of_words s, stopwords=[]
   s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end 
+end
 
+def splitpipe s, n=3
+  s.strip.split("|"*n)
+end
 
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
deleted file mode 100644
index c0f37be..0000000
--- a/lib/nlp_ruby/ttable.rb
+++ /dev/null
@@ -1,85 +0,0 @@
-# table['some French string'] = [Array of English strings]
-def read_phrase_table fn
-  table = {}
-  f = ReadFile.new fn
-  while raw_rule = f.gets
-    french, english, features = splitpipe(raw_rule)
-    feature_map = read_feature_string(features)
-    if table.has_key? french
-      table[french] << [english, feature_map ]
-    else
-      table[french] = [[english, feature_map]]
-    end
-  end
-  f.close
-  return table
-end
-
-# FIXME
-class Translation
-  attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
-
-  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
-    @id = id
-    @raw = raw
-    @s = s
-    @f = f
-    @score = score
-    @rank = rank
-    @other_score = other_score
-  end
-
-  def from_s t, strip_alignment=true, rank=nil
-    id, raw, features, score = splitpipe(t, 3)
-    raw.strip!
-    @raw = raw
-    if strip_alignment # the way moses does it
-      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
-      @s.strip!
-    else
-      @s = raw
-    end
-    @id = id.to_i
-    @f = read_feature_string features
-    @score = score.to_f
-    @rank = rank
-    @other_score = nil
-  end
-
-  def to_s
-    [@id, @s, @f.to_kv, @score].join ' ||| '
-  end
-
-  def to_s2
-    [@rank, @s, @score, @other_score].join ' ||| '
-  end
-end
-
-def read_kbest_lists fn, translation_type=Translation
-  kbest_lists = []
-  cur = []
-  f = ReadFile.new fn
-  prev = -1
-  c = 0
-  id = 0
-  while line = f.gets
-    t = translation_type.new
-    t.from_s line
-    c = splitpipe(line)[0].to_i
-    if c != prev
-      if cur.size > 0
-        kbest_lists << cur
-        cur = []
-      end
-      prev = c
-      id = 0
-    end
-    t.id = id
-    cur << t
-    id += 1
-  end
-  kbest_lists << cur # last one
-  f.close
-  return kbest_lists
-end
-
-- 
cgit v1.2.3