From c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 14 Feb 2014 17:14:49 +0100
Subject: => 0.3; License and README updated; some from_* methods for
SparseVector; ttable.rb => Translation.rb; moved some misc. stuff to misc.rb;
monkey patched String
---
lib/nlp_ruby/PriorityQueue.rb | 37 -------
lib/nlp_ruby/SparseVector.rb | 71 +++++++-------
lib/nlp_ruby/Translation.rb | 66 +++++++++++++
lib/nlp_ruby/bleu.rb | 12 +--
lib/nlp_ruby/cdec.rb | 20 ----
lib/nlp_ruby/dag.rb | 205 +++++++++++++++++++++++++++++++++++++++
lib/nlp_ruby/dags.rb | 218 ------------------------------------------
lib/nlp_ruby/misc.rb | 74 ++++++++++++++
lib/nlp_ruby/semirings.rb | 3 +-
lib/nlp_ruby/stringutil.rb | 41 +-------
lib/nlp_ruby/ttable.rb | 85 ----------------
11 files changed, 394 insertions(+), 438 deletions(-)
delete mode 100644 lib/nlp_ruby/PriorityQueue.rb
create mode 100644 lib/nlp_ruby/Translation.rb
delete mode 100644 lib/nlp_ruby/cdec.rb
create mode 100644 lib/nlp_ruby/dag.rb
delete mode 100644 lib/nlp_ruby/dags.rb
delete mode 100644 lib/nlp_ruby/ttable.rb
(limited to 'lib/nlp_ruby')
diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
deleted file mode 100644
index f090e60..0000000
--- a/lib/nlp_ruby/PriorityQueue.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# FIXME dags
-# this assumes that elements in the queue
-# have a numerical member named 'score'
-class PriorityQueue
-
- def initialize a=Array.new
- @queue = Array.new a
- sort!
- end
-
- def sort!
- @queue.sort_by! { |i| -i.score }
- end
-
- def pop
- @queue.pop
- end
-
- def push i
- @queue << i
- sort!
- end
-
- def empty?
- @queue.empty?
- end
-
- # FIXME
- def to_s
- a = []
- @queue.each { |i|
- a << "#{i.to_s}[#{i.score}]"
- }
- "[#{a.join ', '}]"
- end
-end
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 1c0262b..b80373c 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -20,6 +20,34 @@ class SparseVector < Hash
from_h eval(s)
end
+ # Serialize to "key<sep>value" pairs glued together with join.
+ # sep:  string placed between a key and its value
+ # join: string placed between consecutive pairs
+ def to_kv sep='=', join=' '
+   pairs = self.map { |key, value| "#{key}#{sep}#{value}" }
+   pairs.join join
+ end
+
+ # Fill self from whitespace-separated "key<sep>value" tokens in s (the
+ # format to_kv writes). sep was hard-coded to '='; it is now a parameter
+ # with that default, matching to_kv and from_file. Values go through to_f.
+ def from_kv s, sep='='
+   s.split.each { |tok| k, v = tok.split(sep); self[k] = v.to_f }
+ end
+ # Load "key<sep>value" lines from fn into self (values via to_f); the reader is now closed (it was leaked).
+ def from_file fn, sep='='
+   f = ReadFile.new(fn)
+   while line = f.gets
+     key, value = line.strip.split sep
+     self[key] = value.to_f
+   end
+   f.close
+ end
+ # Returns self's keys followed by other's keys (a key present in both is listed twice).
+ def join_keys other
+ self.keys + other.keys
+ end
+
def sum
self.values.inject(:+)
end
@@ -74,38 +102,6 @@ class SparseVector < Hash
return Math.sqrt(sum)
end
- # FIXME
- def from_kv_file fn, sep=' '
- f = ReadFile.new(fn)
- while line = f.gets
- key, value = line.strip.split sep
- value = value.to_f
- self[key] = value
- end
- end
-
- # FIXME
- def to_kv sep='='
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join ' '
- end
-
- # FIXME
- def to_kv2 sep='='
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join "\n"
- end
-
- def join_keys other
- self.keys + other.keys
- end
-
def + other
new = SparseVector.new
join_keys(other).each { |k|
@@ -132,9 +128,13 @@ class SparseVector < Hash
end
end
-def mean_sparse_vector array_of_vectors
+
+module SparseVector
+
+
+def SparseVector::mean a
mean = SparseVector.new
- array_of_vectors.each { |i|
+ a.each { |i|
i.each_pair { |k,v|
mean[k] += v
}
@@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors
return mean
end
+
+end
+
diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb
new file mode 100644
index 0000000..0c346a4
--- /dev/null
+++ b/lib/nlp_ruby/Translation.rb
@@ -0,0 +1,66 @@
+class Translation
+  # One k-best entry. :scores (hash; from_s fills 'decoder') is now exposed too.
+  attr_accessor :id, :s, :raw, :f, :score, :scores, :rank, :other_score
+
+  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
+    @id, @raw, @s, @f, @scores, @rank = id, raw, s, f, scores, rank
+  end
+
+  # Fill self from one line t of the form "id ||| text ||| features ||| score".
+  # strip_alignment removes |i-j| and |i| markers from the text (moses style).
+  def from_s t, strip_alignment=true, rank=nil
+    id, raw, features, score = splitpipe(t, 3)
+    raw.strip!
+    @raw = raw
+    if strip_alignment # the way moses does it
+      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
+      @s.strip!
+    else
+      @s = raw
+    end
+    @id = id.to_i
+    @f = read_feature_string features
+    @scores['decoder'] = score.to_f
+    @rank = rank
+  end
+
+  # "id ||| text ||| features ||| decoder score", or without the features field.
+  # Fixed: with include_features=true the trailing modifier-if made this nil.
+  def to_s include_features=true
+    return [@id, @s, @scores['decoder']].join(' ||| ') unless include_features
+    [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ')
+  end
+
+  def to_s2
+    [@rank, @s, @score, @scores.to_s].join ' ||| '
+  end
+end
+
+# Read k-best file fn; consecutive lines sharing the same leading sentence id
+# form one list. Entries are built with translation_type (#from_s, #id=) and
+# get their position inside the list as id. Returns the array of lists.
+def read_kbest_lists fn, translation_type=Translation
+  lists = []
+  current = []
+  reader = ReadFile.new fn
+  last_sentence = -1
+  position = 0
+  while line = reader.gets
+    entry = translation_type.new
+    entry.from_s line
+    sentence = splitpipe(line)[0].to_i
+    unless sentence == last_sentence
+      # a new sentence starts: flush the finished list, restart numbering
+      (lists << current; current = []) unless current.empty?
+      last_sentence = sentence
+      position = 0
+    end
+    entry.id = position
+    current << entry
+    position += 1
+  end
+  lists << current # last one
+  reader.close
+  lists
+end
+
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index ee91985..d7a6b2b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1
return p
end
-def BLEU::brevity_penalty c, r
+def BLEU::brevity_penalty c, r, hack=0.0
return 1.0 if c>r
- return Math.exp(1-r/c)
+ return Math.exp 1.0-((r+hack)/c)
end
def BLEU::bleu counts, n, debug=false
@@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false
(100*bleu(counts, n, debug)).round(3)
end
-def BLEU::per_sentence_bleu hypothesis, reference, n=4
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
h_ng = {}; r_ng = {}
(1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
@@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4
(1).upto(m) { |i|
counts_clipped = 0
counts_sum = h_ng[i].size
- h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
+ h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
add = 1.0 if i >= 2
sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
- }
+ }
return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
end
-end
+end # module
diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb
deleted file mode 100644
index 1080f14..0000000
--- a/lib/nlp_ruby/cdec.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-module CDEC
-
-require 'open3'
-
-
-# FIXME
-CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec"
-
-
-def CDEC::kbest input, ini, weights, k, unique=true
- o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r 2>/dev/null"
- j = -1
- ret = []
- o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t}
- return ret
-end
-
-
-end
-
diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb
new file mode 100644
index 0000000..cca35c5
--- /dev/null
+++ b/lib/nlp_ruby/dag.rb
@@ -0,0 +1,205 @@
+module DAG
+
+require 'json'
+
+
+class DAG::Node
+  # label: node name; edges: outgoing edges; incoming: incoming edges; score: set by the algorithms
+  attr_accessor :label, :edges, :incoming, :score, :mark
+
+  def initialize label=nil, edges=[], incoming=[], score=nil
+    @label, @edges, @incoming = label, edges, incoming
+    @score = score # was hard-wired to nil, silently dropping the argument
+  end
+
+  # Append an edge self -> head and return it; a self-loop now raises instead of calling exit.
+  def add_edge head, weight=0
+    raise ArgumentError, 'no self-cycles!' if self==head
+    @edges << DAG::Edge.new(self, head, weight)
+    return @edges.last
+  end
+
+  def to_s
+    "DAG::Node"
+  end
+
+  def repr
+    "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
+  end
+end
+
+# Directed edge tail -> head carrying a weight; mark records whether
+# topological_sort has already followed the edge.
+class DAG::Edge
+  attr_accessor :tail, :head, :weight, :mark
+
+  def initialize tail=nil, head=nil, weight=0
+    @tail, @head, @weight = tail, head, weight
+    # did we already follow this edge? -- for topological sorting
+    @mark = false
+  end
+
+  # "DAG::Edge<tail ->[weight] head>", with " x" before ">" when marked
+  def to_s
+    followed = @mark ? " x" : ""
+    "DAG::Edge<#{@tail} ->[#{weight}] #{@head}#{followed}>"
+  end
+end
+
+# depth-first search from n for the node labelled target_label
+# (w/o markings as we do not have cycles); returns the node or nil.
+# Fixed: the loop used to return the first successor's result, even nil.
+def DAG::dfs n, target_label
+  return n if n.label==target_label # assumes uniq labels!
+  n.edges.reverse_each { |e| # last edge first, as the old stack.pop did
+    found = DAG::dfs e.head, target_label
+    return found if found
+  }
+  return nil
+end
+
+# Breadth-first search from n for the node labelled target_label; returns it or nil.
+# No visited-markings are kept, as we do not have cycles.
+def DAG::bfs n, target_label
+  frontier = [n]
+  until frontier.empty?
+    node = frontier.shift
+    return node if node.label==target_label
+    frontier.concat node.edges.map { |e| e.head }
+  end
+  nil
+end
+# Topological sort: orders graph's nodes so every edge's tail precedes its head.
+# Marks are reset first; stale marks from an earlier call corrupted a re-sort.
+def DAG::topological_sort graph
+  graph.each { |n| n.edges.each { |e| e.mark = false } }
+  sorted, s = [], graph.select { |n| n.incoming.empty? }
+  while !s.empty?
+    sorted << s.shift
+    sorted.last.edges.each { |e|
+      e.mark = true
+      s << e.head if e.head.incoming.all? { |f| f.mark }
+    }
+  end
+  return sorted
+end
+
+# set every node's score to semiring.null, then the source node's to semiring.one
+def DAG::init graph, semiring, source_node
+ graph.each {|n| n.score=semiring.null}
+ source_node.score = semiring.one
+end
+
+# Viterbi over the topologically sorted graph, pulling along incoming edges:
+# score(n) = add(score(n), multiply(score(e.tail), e.weight)) for each incoming e.
+# Scores are reset by DAG::init; results live in each node's score.
+# NOTE(review): default is the class ViterbiSemiring, others use RealSemiring.new -- confirm.
+def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
+  order = DAG::topological_sort(graph)
+  DAG::init(graph, semiring, source_node)
+  order.each { |node|
+    node.incoming.each { |edge|
+      via_edge = semiring.multiply.call(edge.tail.score, edge.weight)
+      node.score = semiring.add.call(node.score, via_edge)
+    }
+  }
+end
+
+# Forward ("push") viterbi: in topological order each node updates the heads of
+# its outgoing edges, head.score = add(head.score, multiply(node.score, weight)).
+# NOTE(review): default is the class ViterbiSemiring, not an instance -- confirm.
+def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
+  order = DAG::topological_sort(graph)
+  DAG::init(graph, semiring, source_node)
+  order.each { |node|
+    node.edges.each { |edge|
+      pushed = semiring.multiply.call(node.score, edge.weight)
+      edge.head.score = semiring.add.call(edge.head.score, pushed)
+    }
+  }
+end
+
+# Dijkstra algorithm (for A*-search we would need an optimistic estimate of
+# future cost at each node). Scores are reset via DAG::init, then nodes are
+# popped from a PriorityQueue holding the whole graph and their outgoing
+# edges updated; the queue is re-sorted after every update.
+def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
+  DAG::init(graph, semiring, source_node)
+  queue = PriorityQueue.new graph
+  until queue.empty?
+    node = queue.pop
+    node.edges.each { |edge|
+      extended = semiring.multiply.call(node.score, edge.weight)
+      edge.head.score = semiring.add.call(edge.head.score, extended)
+      queue.sort!
+    }
+  end
+end
+
+# Bellman-Ford algorithm: after DAG::init, every edge of the graph is
+# updated graph.size-1 times via the semiring's add/multiply;
+# results live in each node's score.
+def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
+  DAG::init(graph, semiring, source_node)
+  all_edges = graph.flat_map { |node| node.edges }.uniq
+  # relax edges
+  (graph.size-1).times {
+    all_edges.each { |edge|
+      relaxed = semiring.multiply.call(edge.tail.score, edge.weight)
+      edge.head.score = semiring.add.call(edge.head.score, relaxed)
+    }
+  }
+  # we do not allow cycles (negative or positive)
+end
+
+# Floyd algorithm: all-pairs path costs, computed on the plain edge
+# weights with + and < (the semiring argument is accepted but unused).
+# Returns a matrix m where m[i][j] is the cost from graph[i] to graph[j],
+# indexed by the nodes' positions in graph; entries start at 0.0 on the
+# diagonal and Infinity elsewhere.
+def DAG::floyd(graph, semiring=nil)
+  size = graph.size
+  infinity = 1.0/0.0
+  # initial costs: nothing is reachable except a node from itself
+  dist = Array.new(size) { |i|
+    Array.new(size) { |j| i==j ? 0.0 : infinity }
+  }
+  # every direct edge overwrites the entry for its (tail, head) pair
+  graph.flat_map { |node| node.edges }.uniq.each { |edge|
+    dist[graph.index(edge.tail)][graph.index(edge.head)] = edge.weight
+  }
+  # then allow each node k in turn as an intermediate stop
+  size.times { |k|
+    size.times { |i|
+      size.times { |j|
+        detour = dist[i][k] + dist[k][j]
+        dist[i][j] = detour if detour < dist[i][j]
+      }
+    }
+  }
+  return dist
+end
+
+
+# Reads JSON file fn with 'nodes' (each has a 'label') and 'edges' (each has 'tail', 'head',
+# 'weight'; the weight goes through to_f and semiring.convert). Returns a list of nodes
+# (graph) and a hash for finding nodes by their label (these need to be unique!)
+def DAG::read_graph_from_json fn, semiring=RealSemiring.new
+  graph = []
+  nodes_by_label = {}
+  h = JSON.parse File.read(fn) # File.new(fn).read left the handle open
+  h['nodes'].each { |i|
+    n = DAG::Node.new i['label']
+    graph << n
+    nodes_by_label[n.label] = n
+  }
+  h['edges'].each { |i|
+    tail, head = nodes_by_label[i['tail']], nodes_by_label[i['head']]
+    head.incoming << tail.add_edge(head, semiring.convert.call(i['weight'].to_f))
+  }
+  return graph, nodes_by_label
+end
+
+
+end # module
+
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dags.rb
deleted file mode 100644
index 7767be1..0000000
--- a/lib/nlp_ruby/dags.rb
+++ /dev/null
@@ -1,218 +0,0 @@
-###########################
-# TODO
-# output paths
-# visualization?
-# algorithms:
-# beam search
-# best-first
-# kbest
-# kruskal (MST)?
-# transitive closure?
-###########################
-
-require 'json'
-
-
-module DAG
-
-
-class DAG::Node
- attr_accessor :label, :edges, :incoming, :score, :mark
-
- def initialize label=nil, edges=[], incoming=[], score=nil
- @label = label
- @edges = edges # outgoing
- @incoming = incoming
- @score = nil
- end
-
- def add_edge head, weight=0
- exit if self==head # no self-cycles!
- @edges << DAG::Edge.new(self, head, weight)
- return @edges.last
- end
-
- def to_s
- "DAG::Node"
- end
-
- def repr
- "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
- end
-end
-
-class DAG::Edge
- attr_accessor :tail, :head, :weight, :mark
-
- def initialize tail=nil, head=nil, weight=0
- @tail = tail
- @head = head
- @weight = weight
- @mark = false # did we already follow this edge? -- for topological sorting
- end
-
- def to_s
- s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
- s += " x" if @mark
- s += ">"
- s
- end
-end
-
-# depth-first search
-# w/o markings as we do not have cycles
-def DAG::dfs n, target_label
- return n if n.label==target_label # assumes uniq labels!
- stack = n.edges.map { |i| i.head }
- while !stack.empty?
- m = stack.pop
- return DAG::dfs m, target_label
- end
- return nil
-end
-
-# breadth-first search
-# w/o markings as we do not have cycles
-def DAG::bfs n, target_label
- queue = [n]
- while !queue.empty?
- m = queue.shift
- return m if m.label==target_label
- m.edges.each { |e| queue << e.head }
- end
- return nil
-end
-
-# topological sort
-def DAG::topological_sort graph
- sorted = []
- s = graph.reject { |n| !n.incoming.empty? }
- while !s.empty?
- sorted << s.shift
- sorted.last.edges.each { |e|
- e.mark = true
- s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
- }
- end
- return sorted
-end
-
-# initialize graph scores with semiring One
-def DAG::init graph, semiring, source_node
- graph.each {|n| n.score=semiring.null}
- source_node.score = semiring.one
-end
-
-# viterbi
-def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
- toposorted = DAG::topological_sort(graph)
- DAG::init(graph, semiring, source_node)
- toposorted.each { |n|
- n.incoming.each { |e|
- # update
- n.score = \
- semiring.add.call(n.score, \
- semiring.multiply.call(e.tail.score, e.weight)
- )
- }
- }
-end
-
-# forward viterbi
-def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
- toposorted = DAG::topological_sort(graph)
- DAG::init(graph, semiring, source_node)
- toposorted.each { |n|
- n.edges.each { |e|
- e.head.score = \
- semiring.add.call(e.head.score, \
- semiring.multiply.call(n.score, e.weight)
- )
- }
- }
-end
-
-# Dijkstra algorithm
-# for A*-search we would need an optimistic estimate of
-# future cost at each node
-def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
- DAG::init(graph, semiring, source_node)
- q = PriorityQueue.new graph
- while !q.empty?
- n = q.pop
- n.edges.each { |e|
- e.head.score = \
- semiring.add.call(e.head.score, \
- semiring.multiply.call(n.score, e.weight))
- q.sort!
- }
- end
-end
-
-# Bellman-Ford algorithm
-def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
- DAG::init(graph, semiring, source_node)
- edges = []
- graph.each { |n| edges |= n.edges }
- # relax edges
- (graph.size-1).times{ |i|
- edges.each { |e|
- e.head.score = \
- semiring.add.call(e.head.score, \
- semiring.multiply.call(e.tail.score, e.weight))
- }
- }
- # we do not allow cycles (negative or positive)
-end
-
-# Floyd algorithm
-def DAG::floyd(graph, semiring=nil)
- dist_matrix = []
- graph.each_index { |i|
- dist_matrix << []
- graph.each_index { |j|
- val = 1.0/0.0
- val = 0.0 if i==j
- dist_matrix.last << val
- }
- }
- edges = []
- graph.each { |n| edges |= n.edges }
- edges.each { |e|
- dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
- }
- 0.upto(graph.size-1) { |k|
- 0.upto(graph.size-1) { |i|
- 0.upto(graph.size-1) { |j|
- if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
- dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
- end
- }
- }
- }
- return dist_matrix
-end
-
-
-# returns a list of nodes (graph) and a hash for finding
-# nodes by their label (these need to be unique!)
-def DAG::read_graph_from_json fn, semiring=RealSemiring.new
- graph = []
- nodes_by_label = {}
- h = JSON.parse File.new(fn).read
- h['nodes'].each { |i|
- n = DAG::Node.new i['label']
- graph << n
- nodes_by_label[n.label] = n
- }
- h['edges'].each { |i|
- n = nodes_by_label[i['tail']]
- a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
- nodes_by_label[i['head']].incoming << a
- }
- return graph, nodes_by_label
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 80d932c..0f58100 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -21,6 +21,40 @@ class Array
end
end
+class String
+  # Truthy (the first lowercase character) if s contains one, else nil.
+  def downcase? s=self # s now defaults to the receiver, so "aB".downcase? works
+    s[/[[:lower:]]/]
+  end
+end
+
+# Array-backed queue whose elements must have a numerical member named
+# 'score'; it is kept sorted by descending score, so pop returns the
+# element with the lowest score.
+class PriorityQueue
+  def initialize a=Array.new
+    @queue = Array.new(a)
+    sort!
+  end
+
+  def sort!
+    @queue.sort_by! { |element| -element.score }
+  end
+
+  def pop
+    @queue.pop
+  end
+
+  def push i
+    @queue.push i
+    sort!
+  end
+
+  def empty?
+    @queue.empty?
+  end
+end
+
def spawn_with_timeout cmd, t=4, debug=false
require 'timeout'
STDERR.write cmd+"\n" if debug
@@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false
return pipe_in.read
end
+# table['some French string'] = [[English string, feature map], ...]
+# Reads one rule per line from fn; splitpipe yields the french, english
+# and feature fields, and the features go through read_feature_string.
+def read_phrase_table fn
+  table = {}
+  reader = ReadFile.new fn
+  while rule = reader.gets
+    source, target, features = splitpipe(rule)
+    entry = [target, read_feature_string(features)]
+    table[source] = [] unless table.has_key? source
+    table[source] << entry
+  end
+  reader.close
+  table
+end
+
+# k-best list (array of Translation, rank = position) from running cdec_bin on
+# input. input now goes to stdin instead of through `echo "..."`, so quotes,
+# $ and backticks in the sentence no longer break (or inject into) the shell.
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+  require 'open3'
+  cmd = "#{cdec_bin} -c #{ini} -w #{weights} -k #{k}#{' -r' if unique} 2>/dev/null"
+  o,_ = Open3.capture2 cmd, :stdin_data => "#{input}\n"
+  o.split("\n").each_with_index.map { |l, j| t = Translation.new; t.from_s(l, false, j); t }
+end
+
+# Parse "key = value" lines of fn into a hash of strings; blank lines and '#' comments
+# (whole-line or trailing) are skipped. Now closes the reader; a line without '=' gives ''.
+def read_config fn
+  f = ReadFile.new fn
+  cfg = {}
+  while line = f.gets
+    line.strip!
+    next if line.empty? || line[0]=='#'
+    k, v = line.split('#', 2).first.split(/\s*=\s*/, 2)
+    cfg[k.strip] = v.to_s.strip # v is nil when the line has no '='
+  end
+  f.close
+  return cfg
+end
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
index a06f151..83551a9 100644
--- a/lib/nlp_ruby/semirings.rb
+++ b/lib/nlp_ruby/semirings.rb
@@ -1,4 +1,5 @@
-# semirings for graphs as described in
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
# 'Dynamic Programming Algorithms in
# Semiring and Hypergraph Frameworks' (Liang Huang)
class Semiring
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
def tokenize s
s.strip.split
end
-def splitpipe s, n=3
- s.strip.split("|"*n)
-end
-
-def downcase? s
- s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
def ngrams(s, n, fix=false)
a = tokenize s
a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
}
end
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
- map = SparseVector.new
- tokenize(s).each { |i|
- key, value = i.split '='
- map[key] = value.to_f
- }
- return map
-end
-
-
-def read_cfg fn
- f = ReadFile.new fn
- cfg = {}
- while line = f.gets
- line.strip!
- next if /^\s*$/.match line
- next if line[0]=='#'
- content = line.split('#', 2).first
- k, v = content.split(/\s*=\s*/, 2)
- k.strip!; v.strip!
- cfg[k] = v
- end
- return cfg
-end
-
def bag_of_words s, stopwords=[]
s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end
+end
+def splitpipe s, n=3 # strip s, then split it on the delimiter made of n '|' characters
+ s.strip.split("|"*n)
+end
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
deleted file mode 100644
index c0f37be..0000000
--- a/lib/nlp_ruby/ttable.rb
+++ /dev/null
@@ -1,85 +0,0 @@
-# table['some French string'] = [Array of English strings]
-def read_phrase_table fn
- table = {}
- f = ReadFile.new fn
- while raw_rule = f.gets
- french, english, features = splitpipe(raw_rule)
- feature_map = read_feature_string(features)
- if table.has_key? french
- table[french] << [english, feature_map ]
- else
- table[french] = [[english, feature_map]]
- end
- end
- f.close
- return table
-end
-
-# FIXME
-class Translation
- attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
-
- def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
- @id = id
- @raw = raw
- @s = s
- @f = f
- @score = score
- @rank = rank
- @other_score = other_score
- end
-
- def from_s t, strip_alignment=true, rank=nil
- id, raw, features, score = splitpipe(t, 3)
- raw.strip!
- @raw = raw
- if strip_alignment # the way moses does it
- @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
- @s.strip!
- else
- @s = raw
- end
- @id = id.to_i
- @f = read_feature_string features
- @score = score.to_f
- @rank = rank
- @other_score = nil
- end
-
- def to_s
- [@id, @s, @f.to_kv, @score].join ' ||| '
- end
-
- def to_s2
- [@rank, @s, @score, @other_score].join ' ||| '
- end
-end
-
-def read_kbest_lists fn, translation_type=Translation
- kbest_lists = []
- cur = []
- f = ReadFile.new fn
- prev = -1
- c = 0
- id = 0
- while line = f.gets
- t = translation_type.new
- t.from_s line
- c = splitpipe(line)[0].to_i
- if c != prev
- if cur.size > 0
- kbest_lists << cur
- cur = []
- end
- prev = c
- id = 0
- end
- t.id = id
- cur << t
- id += 1
- end
- kbest_lists << cur # last one
- f.close
- return kbest_lists
-end
-
--
cgit v1.2.3