From c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 14 Feb 2014 17:14:49 +0100 Subject: => 0.3; License and README updated; some from_* methods for SparseVector; ttable.rb => Translation.rb; moved some misc. stuff to misc.rb; monkey patched String --- LICENSE | 22 ++++- README.md | 18 ++-- lib/nlp_ruby.rb | 6 +- lib/nlp_ruby/PriorityQueue.rb | 37 ------- lib/nlp_ruby/SparseVector.rb | 71 +++++++------- lib/nlp_ruby/Translation.rb | 66 +++++++++++++ lib/nlp_ruby/bleu.rb | 12 +-- lib/nlp_ruby/cdec.rb | 20 ---- lib/nlp_ruby/dag.rb | 205 +++++++++++++++++++++++++++++++++++++++ lib/nlp_ruby/dags.rb | 218 ------------------------------------------ lib/nlp_ruby/misc.rb | 74 ++++++++++++++ lib/nlp_ruby/semirings.rb | 3 +- lib/nlp_ruby/stringutil.rb | 41 +------- lib/nlp_ruby/ttable.rb | 85 ---------------- 14 files changed, 424 insertions(+), 454 deletions(-) delete mode 100644 lib/nlp_ruby/PriorityQueue.rb create mode 100644 lib/nlp_ruby/Translation.rb delete mode 100644 lib/nlp_ruby/cdec.rb create mode 100644 lib/nlp_ruby/dag.rb delete mode 100644 lib/nlp_ruby/dags.rb delete mode 100644 lib/nlp_ruby/ttable.rb diff --git a/LICENSE b/LICENSE index 0d5dab3..cf4d89e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,23 @@ +The MIT License + Copyright (C) 2014 Patrick Simianer

+http://simianer.de + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 26858ff..5db1487 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,13 @@ nlp_ruby My little NLP library, supposed to make _my_ work a little easier and less redundant. The .gem can be found here: https://rubygems.org/gems/nlp_ruby -* dags.rb : implementation of a directed acyclic graph and various algorithms -* fileutil.rb : file utilities -* PriorityQueue.rb : a simple priority queue -* semirings.rb : semirings for dags.rb -* SparseVector.rb : sparse vectors for ruby, based on Hash -* stringutil.rb : string utilities -* tfidf.rb : functions to calculate tf/ntf/idf -* ttable.rb : functions to read MT phrase tables -* Vector.rb : vector class based on Array + bleu.rb : BLEU implementation, also per-sentence-BLEU + dag.rb : implementation of a directed acyclic graph and various algorithms + fileutil.rb : file utilities + misc.rb : misc. stuff (e.g. monkey patches for Array and String) + semirings.rb : semirings (used in dags.rb) + SparseVector.rb : sparse vectors for ruby, based on Hash class + stringutil.rb : string utilities + tfidf.rb : functions to calculate tf/ntf/idf + Translation.rb : an object for diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb index c7af97b..a3f9f1a 100755 --- a/lib/nlp_ruby.rb +++ b/lib/nlp_ruby.rb @@ -3,14 +3,12 @@ require 'nlp_ruby/stringutil' require 'nlp_ruby/fileutil' require 'nlp_ruby/SparseVector' -require 'nlp_ruby/PriorityQueue' require 'nlp_ruby/tfidf' -require 'nlp_ruby/ttable' -require 'nlp_ruby/dags' +require 'nlp_ruby/Translation' +require 'nlp_ruby/dag' require 'nlp_ruby/semirings' require 'nlp_ruby/bleu' require 'nlp_ruby/misc' -require 'nlp_ruby/cdec' STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb deleted file mode 100644 index f090e60..0000000 --- a/lib/nlp_ruby/PriorityQueue.rb +++ /dev/null @@ -1,37 +0,0 @@ -# FIXME dags -# this assumes that elements in the queue -# have a numerical member named 'score' -class PriorityQueue - - def initialize a=Array.new - @queue = Array.new a - sort! - end - - def sort! - @queue.sort_by! { |i| -i.score } - end - - def pop - @queue.pop - end - - def push i - @queue << i - sort! - end - - def empty? - @queue.empty? - end - - # FIXME - def to_s - a = [] - @queue.each { |i| - a << "#{i.to_s}[#{i.score}]" - } - "[#{a.join ', '}]" - end -end - diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index 1c0262b..b80373c 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -20,6 +20,34 @@ class SparseVector < Hash from_h eval(s) end + def to_kv sep='=', join=' ' + a = [] + self.each_pair { |k,v| + a << "#{k}#{sep}#{v}" + } + return a.join join + end + + def from_kv s + s.split.each { |i| + k,v = i.split('=') + self[k] = v.to_f + } + end + + def from_file fn, sep='=' + f = ReadFile.new(fn) + while line = f.gets + key, value = line.strip.split sep + value = value.to_f + self[key] = value + end + end + + def join_keys other + self.keys + other.keys + end + def sum self.values.inject(:+) end @@ -74,38 +102,6 @@ class SparseVector < Hash return Math.sqrt(sum) end - # FIXME - def from_kv_file fn, sep=' ' - f = ReadFile.new(fn) - while line = f.gets - key, value = line.strip.split sep - value = value.to_f - self[key] = value - end - end - - # FIXME - def to_kv sep='=' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join ' ' - end - - # FIXME - def to_kv2 sep='=' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join "\n" - end - - def join_keys other - self.keys + other.keys - end - def + other new = SparseVector.new join_keys(other).each { |k| @@ -132,9 +128,13 @@ class SparseVector < Hash end end -def mean_sparse_vector array_of_vectors + +module SparseVector + + +def SparseVector::mean a mean = SparseVector.new - array_of_vectors.each { |i| + a.each { |i| i.each_pair { |k,v| mean[k] += v } @@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors return mean end + +end + diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb new file mode 100644 index 0000000..0c346a4 --- /dev/null +++ b/lib/nlp_ruby/Translation.rb @@ -0,0 +1,66 @@ +class Translation + attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score + + def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil + @id = id + @raw = raw + @s = s + @f = f + @scores = scores + @rank = rank + end + + def from_s t, strip_alignment=true, rank=nil + id, raw, features, score = splitpipe(t, 3) + raw.strip! + @raw = raw + if strip_alignment # the way moses does it + @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ') + @s.strip! + else + @s = raw + end + @id = id.to_i + @f = read_feature_string features + @scores['decoder'] = score.to_f + @rank = rank + end + + def to_s include_features=true + [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features + [@id, @s, @scores['decoder']].join(' ||| ') if !include_features + end + + def to_s2 + [@rank, @s, @score, @scores.to_s].join ' ||| ' + end +end + +def read_kbest_lists fn, translation_type=Translation + kbest_lists = [] + cur = [] + f = ReadFile.new fn + prev = -1 + c = 0 + id = 0 + while line = f.gets + t = translation_type.new + t.from_s line + c = splitpipe(line)[0].to_i + if c != prev + if cur.size > 0 + kbest_lists << cur + cur = [] + end + prev = c + id = 0 + end + t.id = id + cur << t + id += 1 + end + kbest_lists << cur # last one + f.close + return kbest_lists +end + diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb index ee91985..d7a6b2b 100644 --- a/lib/nlp_ruby/bleu.rb +++ b/lib/nlp_ruby/bleu.rb @@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1 return p end -def BLEU::brevity_penalty c, r +def BLEU::brevity_penalty c, r, hack=0.0 return 1.0 if c>r - return Math.exp(1-r/c) + return Math.exp 1.0-((r+hack)/c) end def BLEU::bleu counts, n, debug=false @@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false (100*bleu(counts, n, debug)).round(3) end -def BLEU::per_sentence_bleu hypothesis, reference, n=4 +def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0 h_ng = {}; r_ng = {} (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} ngrams(hypothesis, n) {|i| h_ng[i.size] << i} @@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4 (1).upto(m) { |i| counts_clipped = 0 counts_sum = h_ng[i].size - h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} + h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } add = 1.0 if i >= 2 sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); - } + } return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum) end -end +end # module diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb deleted file mode 100644 index 1080f14..0000000 --- a/lib/nlp_ruby/cdec.rb +++ /dev/null @@ -1,20 +0,0 @@ -module CDEC - -require 'open3' - - -# FIXME -CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec" - - -def CDEC::kbest input, ini, weights, k, unique=true - o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r 2>/dev/null" - j = -1 - ret = [] - o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t} - return ret -end - - -end - diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb new file mode 100644 index 0000000..cca35c5 --- /dev/null +++ b/lib/nlp_ruby/dag.rb @@ -0,0 +1,205 @@ +module DAG + +require 'json' + + +class DAG::Node + attr_accessor :label, :edges, :incoming, :score, :mark + + def initialize label=nil, edges=[], incoming=[], score=nil + @label = label + @edges = edges # outgoing + @incoming = incoming + @score = nil + end + + def add_edge head, weight=0 + exit if self==head # no self-cycles! + @edges << DAG::Edge.new(self, head, weight) + return @edges.last + end + + def to_s + "DAG::Node" + end + + def repr + "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" + end +end + +class DAG::Edge + attr_accessor :tail, :head, :weight, :mark + + def initialize tail=nil, head=nil, weight=0 + @tail = tail + @head = head + @weight = weight + @mark = false # did we already follow this edge? -- for topological sorting + end + + def to_s + s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}" + s += " x" if @mark + s += ">" + s + end +end + +# depth-first search +# w/o markings as we do not have cycles +def DAG::dfs n, target_label + return n if n.label==target_label # assumes uniq labels! + stack = n.edges.map { |i| i.head } + while !stack.empty? + m = stack.pop + return DAG::dfs m, target_label + end + return nil +end + +# breadth-first search +# w/o markings as we do not have cycles +def DAG::bfs n, target_label + queue = [n] + while !queue.empty? + m = queue.shift + return m if m.label==target_label + m.edges.each { |e| queue << e.head } + end + return nil +end + +# topological sort +def DAG::topological_sort graph + sorted = [] + s = graph.reject { |n| !n.incoming.empty? } + while !s.empty? + sorted << s.shift + sorted.last.edges.each { |e| + e.mark = true + s << e.head if e.head.incoming.reject{|f| f.mark}.empty? + } + end + return sorted +end + +# initialize graph scores with semiring One +def DAG::init graph, semiring, source_node + graph.each {|n| n.score=semiring.null} + source_node.score = semiring.one +end + +# viterbi +def DAG::viterbi graph, semiring=ViterbiSemiring, source_node + toposorted = DAG::topological_sort(graph) + DAG::init(graph, semiring, source_node) + toposorted.each { |n| + n.incoming.each { |e| + # update + n.score = \ + semiring.add.call(n.score, \ + semiring.multiply.call(e.tail.score, e.weight) + ) + } + } +end + +# forward viterbi +def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node + toposorted = DAG::topological_sort(graph) + DAG::init(graph, semiring, source_node) + toposorted.each { |n| + n.edges.each { |e| + e.head.score = \ + semiring.add.call(e.head.score, \ + semiring.multiply.call(n.score, e.weight) + ) + } + } +end + +# Dijkstra algorithm +# for A*-search we would need an optimistic estimate of +# future cost at each node +def DAG::dijkstra graph, semiring=RealSemiring.new, source_node + DAG::init(graph, semiring, source_node) + q = PriorityQueue.new graph + while !q.empty? + n = q.pop + n.edges.each { |e| + e.head.score = \ + semiring.add.call(e.head.score, \ + semiring.multiply.call(n.score, e.weight)) + q.sort! + } + end +end + +# Bellman-Ford algorithm +def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node) + DAG::init(graph, semiring, source_node) + edges = [] + graph.each { |n| edges |= n.edges } + # relax edges + (graph.size-1).times{ |i| + edges.each { |e| + e.head.score = \ + semiring.add.call(e.head.score, \ + semiring.multiply.call(e.tail.score, e.weight)) + } + } + # we do not allow cycles (negative or positive) +end + +# Floyd algorithm +def DAG::floyd(graph, semiring=nil) + dist_matrix = [] + graph.each_index { |i| + dist_matrix << [] + graph.each_index { |j| + val = 1.0/0.0 + val = 0.0 if i==j + dist_matrix.last << val + } + } + edges = [] + graph.each { |n| edges |= n.edges } + edges.each { |e| + dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight + } + 0.upto(graph.size-1) { |k| + 0.upto(graph.size-1) { |i| + 0.upto(graph.size-1) { |j| + if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j] + dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j] + end + } + } + } + return dist_matrix +end + + +# returns a list of nodes (graph) and a hash for finding +# nodes by their label (these need to be unique!) +def DAG::read_graph_from_json fn, semiring=RealSemiring.new + graph = [] + nodes_by_label = {} + h = JSON.parse File.new(fn).read + h['nodes'].each { |i| + n = DAG::Node.new i['label'] + graph << n + nodes_by_label[n.label] = n + } + h['edges'].each { |i| + n = nodes_by_label[i['tail']] + a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f)) + nodes_by_label[i['head']].incoming << a + } + return graph, nodes_by_label +end + + +end # module + diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dags.rb deleted file mode 100644 index 7767be1..0000000 --- a/lib/nlp_ruby/dags.rb +++ /dev/null @@ -1,218 +0,0 @@ -########################### -# TODO -# output paths -# visualization? -# algorithms: -# beam search -# best-first -# kbest -# kruskal (MST)? -# transitive closure? -########################### - -require 'json' - - -module DAG - - -class DAG::Node - attr_accessor :label, :edges, :incoming, :score, :mark - - def initialize label=nil, edges=[], incoming=[], score=nil - @label = label - @edges = edges # outgoing - @incoming = incoming - @score = nil - end - - def add_edge head, weight=0 - exit if self==head # no self-cycles! - @edges << DAG::Edge.new(self, head, weight) - return @edges.last - end - - def to_s - "DAG::Node" - end - - def repr - "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" - end -end - -class DAG::Edge - attr_accessor :tail, :head, :weight, :mark - - def initialize tail=nil, head=nil, weight=0 - @tail = tail - @head = head - @weight = weight - @mark = false # did we already follow this edge? -- for topological sorting - end - - def to_s - s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}" - s += " x" if @mark - s += ">" - s - end -end - -# depth-first search -# w/o markings as we do not have cycles -def DAG::dfs n, target_label - return n if n.label==target_label # assumes uniq labels! - stack = n.edges.map { |i| i.head } - while !stack.empty? - m = stack.pop - return DAG::dfs m, target_label - end - return nil -end - -# breadth-first search -# w/o markings as we do not have cycles -def DAG::bfs n, target_label - queue = [n] - while !queue.empty? - m = queue.shift - return m if m.label==target_label - m.edges.each { |e| queue << e.head } - end - return nil -end - -# topological sort -def DAG::topological_sort graph - sorted = [] - s = graph.reject { |n| !n.incoming.empty? } - while !s.empty? - sorted << s.shift - sorted.last.edges.each { |e| - e.mark = true - s << e.head if e.head.incoming.reject{|f| f.mark}.empty? - } - end - return sorted -end - -# initialize graph scores with semiring One -def DAG::init graph, semiring, source_node - graph.each {|n| n.score=semiring.null} - source_node.score = semiring.one -end - -# viterbi -def DAG::viterbi graph, semiring=ViterbiSemiring, source_node - toposorted = DAG::topological_sort(graph) - DAG::init(graph, semiring, source_node) - toposorted.each { |n| - n.incoming.each { |e| - # update - n.score = \ - semiring.add.call(n.score, \ - semiring.multiply.call(e.tail.score, e.weight) - ) - } - } -end - -# forward viterbi -def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node - toposorted = DAG::topological_sort(graph) - DAG::init(graph, semiring, source_node) - toposorted.each { |n| - n.edges.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(n.score, e.weight) - ) - } - } -end - -# Dijkstra algorithm -# for A*-search we would need an optimistic estimate of -# future cost at each node -def DAG::dijkstra graph, semiring=RealSemiring.new, source_node - DAG::init(graph, semiring, source_node) - q = PriorityQueue.new graph - while !q.empty? - n = q.pop - n.edges.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(n.score, e.weight)) - q.sort! - } - end -end - -# Bellman-Ford algorithm -def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node) - DAG::init(graph, semiring, source_node) - edges = [] - graph.each { |n| edges |= n.edges } - # relax edges - (graph.size-1).times{ |i| - edges.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(e.tail.score, e.weight)) - } - } - # we do not allow cycles (negative or positive) -end - -# Floyd algorithm -def DAG::floyd(graph, semiring=nil) - dist_matrix = [] - graph.each_index { |i| - dist_matrix << [] - graph.each_index { |j| - val = 1.0/0.0 - val = 0.0 if i==j - dist_matrix.last << val - } - } - edges = [] - graph.each { |n| edges |= n.edges } - edges.each { |e| - dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight - } - 0.upto(graph.size-1) { |k| - 0.upto(graph.size-1) { |i| - 0.upto(graph.size-1) { |j| - if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j] - dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j] - end - } - } - } - return dist_matrix -end - - -# returns a list of nodes (graph) and a hash for finding -# nodes by their label (these need to be unique!) -def DAG::read_graph_from_json fn, semiring=RealSemiring.new - graph = [] - nodes_by_label = {} - h = JSON.parse File.new(fn).read - h['nodes'].each { |i| - n = DAG::Node.new i['label'] - graph << n - nodes_by_label[n.label] = n - } - h['edges'].each { |i| - n = nodes_by_label[i['tail']] - a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f)) - nodes_by_label[i['head']].incoming << a - } - return graph, nodes_by_label -end - - -end # module - diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb index 80d932c..0f58100 100644 --- a/lib/nlp_ruby/misc.rb +++ b/lib/nlp_ruby/misc.rb @@ -21,6 +21,40 @@ class Array end end +class String + + def downcase? s + s[/[[:lower:]]/] + end +end + +class PriorityQueue +# This assumes that elements in the queue +# have a numerical member named 'score'. + + def initialize a=Array.new + @queue = Array.new a + sort! + end + + def sort! + @queue.sort_by! { |i| -i.score } + end + + def pop + @queue.pop + end + + def push i + @queue << i + sort! + end + + def empty? + @queue.empty? + end +end + def spawn_with_timeout cmd, t=4, debug=false require 'timeout' STDERR.write cmd+"\n" if debug @@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false return pipe_in.read end +def read_phrase_table fn + table = {} + f = ReadFile.new fn + while raw_rule = f.gets + french, english, features = splitpipe(raw_rule) + feature_map = read_feature_string(features) + if table.has_key? french + table[french] << [english, feature_map ] + else + table[french] = [[english, feature_map]] + end + end + f.close + return table +end + +def cdec_kbest cdec_bin, input, ini, weights, k, unique=true + require 'open3' + cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}" + cmd += " -r" if unique + o,_ = Open3.capture2 "#{cmd} 2>/dev/null" + a = []; j = -1 + o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t } + return a +end + +def read_config fn + f = ReadFile.new fn + cfg = {} + while line = f.gets + line.strip! + next if /^\s*$/.match line + next if line[0]=='#' + content = line.split('#', 2).first + k, v = content.split(/\s*=\s*/, 2) + k.strip!; v.strip! + cfg[k] = v + end + return cfg +end diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb index a06f151..83551a9 100644 --- a/lib/nlp_ruby/semirings.rb +++ b/lib/nlp_ruby/semirings.rb @@ -1,4 +1,5 @@ -# semirings for graphs as described in +# Semirings for directed acyclic graphs (dags) (also directed hypergraphs), +# as described in: # 'Dynamic Programming Algorithms in # Semiring and Hypergraph Frameworks' (Liang Huang) class Semiring diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb index d7381bb..aa9be00 100644 --- a/lib/nlp_ruby/stringutil.rb +++ b/lib/nlp_ruby/stringutil.rb @@ -1,17 +1,7 @@ -# whitespace 'tokenizer' def tokenize s s.strip.split end -def splitpipe s, n=3 - s.strip.split("|"*n) -end - -def downcase? s - s[/[[:lower:]]/] -end - -# iterator over n-grams def ngrams(s, n, fix=false) a = tokenize s a.each_with_index { |tok, i| @@ -22,34 +12,11 @@ def ngrams(s, n, fix=false) } end -# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 } -def read_feature_string s - map = SparseVector.new - tokenize(s).each { |i| - key, value = i.split '=' - map[key] = value.to_f - } - return map -end - - -def read_cfg fn - f = ReadFile.new fn - cfg = {} - while line = f.gets - line.strip! - next if /^\s*$/.match line - next if line[0]=='#' - content = line.split('#', 2).first - k, v = content.split(/\s*=\s*/, 2) - k.strip!; v.strip! - cfg[k] = v - end - return cfg -end - def bag_of_words s, stopwords=[] s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } -end +end +def splitpipe s, n=3 + s.strip.split("|"*n) +end diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb deleted file mode 100644 index c0f37be..0000000 --- a/lib/nlp_ruby/ttable.rb +++ /dev/null @@ -1,85 +0,0 @@ -# table['some French string'] = [Array of English strings] -def read_phrase_table fn - table = {} - f = ReadFile.new fn - while raw_rule = f.gets - french, english, features = splitpipe(raw_rule) - feature_map = read_feature_string(features) - if table.has_key? french - table[french] << [english, feature_map ] - else - table[french] = [[english, feature_map]] - end - end - f.close - return table -end - -# FIXME -class Translation - attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score - - def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil - @id = id - @raw = raw - @s = s - @f = f - @score = score - @rank = rank - @other_score = other_score - end - - def from_s t, strip_alignment=true, rank=nil - id, raw, features, score = splitpipe(t, 3) - raw.strip! - @raw = raw - if strip_alignment # the way moses does it - @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ') - @s.strip! - else - @s = raw - end - @id = id.to_i - @f = read_feature_string features - @score = score.to_f - @rank = rank - @other_score = nil - end - - def to_s - [@id, @s, @f.to_kv, @score].join ' ||| ' - end - - def to_s2 - [@rank, @s, @score, @other_score].join ' ||| ' - end -end - -def read_kbest_lists fn, translation_type=Translation - kbest_lists = [] - cur = [] - f = ReadFile.new fn - prev = -1 - c = 0 - id = 0 - while line = f.gets - t = translation_type.new - t.from_s line - c = splitpipe(line)[0].to_i - if c != prev - if cur.size > 0 - kbest_lists << cur - cur = [] - end - prev = c - id = 0 - end - t.id = id - cur << t - id += 1 - end - kbest_lists << cur # last one - f.close - return kbest_lists -end - -- cgit v1.2.3