diff options
author | Patrick Simianer <p@simianer.de> | 2014-06-16 17:44:07 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-06-16 17:44:07 +0200 |
commit | 4059a5d048cb0f72872c98073ef1ce120a30d78c (patch) | |
tree | 4fbff0dc62c5ef3deea0ffdec578e3f2c0ed74b6 /lib/nlp_ruby | |
parent | 912ff6aebcf4f89f9e64b5f59956dbf7d8f624e3 (diff) |
renaming to zipf
Diffstat (limited to 'lib/nlp_ruby')
-rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 172 | ||||
-rw-r--r-- | lib/nlp_ruby/Translation.rb | 72 | ||||
-rw-r--r-- | lib/nlp_ruby/bleu.rb | 130 | ||||
-rw-r--r-- | lib/nlp_ruby/dag.rb | 205 | ||||
-rw-r--r-- | lib/nlp_ruby/fileutil.rb | 88 | ||||
-rw-r--r-- | lib/nlp_ruby/grammar.rb | 122 | ||||
-rw-r--r-- | lib/nlp_ruby/hg.rb | 173 | ||||
-rw-r--r-- | lib/nlp_ruby/misc.rb | 114 | ||||
-rw-r--r-- | lib/nlp_ruby/semirings.rb | 79 | ||||
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 22 | ||||
-rw-r--r-- | lib/nlp_ruby/tfidf.rb | 38 |
11 files changed, 0 insertions, 1215 deletions
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb deleted file mode 100644 index 3096412..0000000 --- a/lib/nlp_ruby/SparseVector.rb +++ /dev/null @@ -1,172 +0,0 @@ -class SparseVector < Hash - - def initialize arg=nil - super - self.default = 0 - if arg.is_a? Array - from_a arg - end - end - - def from_a a - a.each_with_index { |i,j| self[j] = i } - end - - def self.from_a a - v = SparseVector.new - v.from_a a - return v - end - - def from_h h - h.each_pair { |k,v| self[k] = v } - end - - def self.from_h h - v = SparseVector.new - v.from_h h - return v - end - - def from_s s - from_h eval(s) - end - - def self.from_s s - v = SparseVector.new - v.from_s s - return v - end - - def to_kv sep='=', join=' ' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join join - end - - def from_kv s - s.split.each { |i| - k,v = i.split('=') - self[k] = v.to_f - } - end - - def self.from_kv s - v = SparseVector.new - v.from_kv s - return v - end - - def from_file fn, sep='=' - f = ReadFile.new(fn) - while line = f.gets - key, value = line.strip.split sep - value = value.to_f - self[key] = value - end - end - - def self.from_file fn, sep='=' - v = SparseVector.new - v.from_file fn, sep - return v - end - - def join_keys other - self.keys + other.keys - end - - def sum - self.values.inject(:+) - end - - def approx_eql? other, p=10**-10 - return false if !other - return false if other.size!=self.size - return false if other.keys.sort!=self.keys.sort - self.keys.each { |k| - return false if (self[k]-other[k]).abs>p - } - return true - end - - def average - self.sum/self.size.to_f - end - - def variance - avg = self.average - var = 0.0 - self.values.each { |i| var += (avg - i)**2 } - return var - end - - def stddev - Math.sqrt self.variance - end - - def dot other - sum = 0.0 - self.each_pair { |k,v| sum += v * other[k] } - return sum - end - - def zeros n - (0).upto(n-1) { |i| self[i] = 0.0 } - end - - def magnitude - Math.sqrt self.values.inject { |sum,i| sum+i**2 } - end - - def cosinus_sim other - self.dot(other)/(self.magnitude*other.magnitude) - end - - def euclidian_dist other - dims = [self.keys, other.keys].flatten.uniq - sum = 0.0 - dims.each { |d| sum += (self[d] - other[d])**2 } - return Math.sqrt(sum) - end - - def + other - new = SparseVector.new - join_keys(other).each { |k| - new[k] = self[k]+other[k] - } - return new - end - - def - other - new = SparseVector.new - join_keys(other).each { |k| - new[k] = self[k]-other[k] - } - return new - end - - def * scalar - raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric - new = SparseVector.new - self.keys.each { |k| - new[k] = self[k] * scalar - } - return new - end - - def self.mean a - mean = SparseVector.new - a.each { |i| - i.each_pair { |k,v| - mean[k] += v - } - } - n = a.size.to_f - mean.each_pair { |k,v| mean[k] = v/n } - return mean - end -end - diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb deleted file mode 100644 index 3759a1d..0000000 --- a/lib/nlp_ruby/Translation.rb +++ /dev/null @@ -1,72 +0,0 @@ -class Translation - attr_accessor :id, :s, :raw, :f, :scores, :rank - - def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil - @id = id - @raw = raw - @s = s - @f = f - @scores = scores - @rank = rank - end - - def from_s t, strip_alignment=true, rank=nil - id, raw, features, score = splitpipe(t, 3) - raw.strip! - @raw = raw - if strip_alignment # the way moses does it - @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ') - @s.strip! - else - @s = raw - end - @id = id.to_i - @f = SparseVector.from_kv features - @scores[:decoder] = score.to_f - @rank = rank - end - - def self.from_s s - t = self.new - t.from_s s - return t - end - - def to_s include_features=true - [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features - [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features - end - - def to_s2 - [@rank, @s, @score, @scores.to_s].join ' ||| ' - end -end - -def read_kbest_lists fn, translation_type=Translation - kbest_lists = [] - cur = [] - f = ReadFile.new fn - prev = -1 - c = 0 - id = 0 - while line = f.gets - t = translation_type.new - t.from_s line - c = splitpipe(line)[0].to_i - if c != prev - if cur.size > 0 - kbest_lists << cur - cur = [] - end - prev = c - id = 0 - end - t.id = id - cur << t - id += 1 - end - kbest_lists << cur # last one - f.close - return kbest_lists -end - diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb deleted file mode 100644 index 56f341b..0000000 --- a/lib/nlp_ruby/bleu.rb +++ /dev/null @@ -1,130 +0,0 @@ -module BLEU - - -class BLEU::NgramCounts - attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n - - def initialize(n) - @n = 0 - @sum = [] - @clipped = [] - @ref_len = 0.0 - @hyp_len = 0.0 - grow(n) - end - - def grow(n) - (n-@n).times { - @sum << 0.0 - @clipped << 0.0 - } - @n = n - end - - def plus_eq(other) - if other.n > @n then grow(other.n) end - 0.upto(other.n-1) { |m| - @sum[m] += other.sum[m] - @clipped[m] += other.clipped[m] - } - @ref_len += other.ref_len - @hyp_len += other.hyp_len - end - - def to_s - return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}" - end -end - -class BLEU::Ngrams - def initialize - @h_ = {} - @h_.default = 0 - end - - def add(k) - if k.class == Array then k = k.join ' ' end - @h_[k] += 1 - end - - def get_count(k) - if k.class == Array then k = k.join ' ' end - return @h_[k] - end - - def each - @h_.each_pair { |k,v| - yield k.split, v - } - end - - def to_s - @h_.to_s - end -end - -def BLEU::get_counts hypothesis, reference, n, times=1 - p = NgramCounts.new n - r = Ngrams.new - ngrams(reference, n) { |ng| r.add ng } - h = Ngrams.new - ngrams(hypothesis, n) { |ng| h.add ng } - h.each { |ng,count| - sz = ng.size-1 - p.sum[sz] += count * times - p.clipped[sz] += [r.get_count(ng), count].min * times - } - p.ref_len = tokenize(reference.strip).size * times - p.hyp_len = tokenize(hypothesis.strip).size * times - return p -end - -def BLEU::brevity_penalty c, r, smooth=0.0 - return [0.0, 1.0-((r+smooth)/c)].min -end - -def BLEU::bleu counts, n, debug=false - corpus_stats = NgramCounts.new n - counts.each { |i| corpus_stats.plus_eq i } - logbleu = 0.0 - 0.upto(n-1) { |m| - STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug - return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0 - logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m]) - } - logbleu /= n - if debug - STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n" - STDERR.write "sum #{Math.exp(sum)}\n" - end - logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len - return Math.exp logbleu -end - -def BLEU::hbleu counts, n, debug=false - (100*bleu(counts, n, debug)).round(3) -end - -def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0 - h_ng = {}; r_ng = {} - (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] } - ngrams(hypothesis, n) { |i| h_ng[i.size] << i } - ngrams(reference, n) { |i| r_ng[i.size] << i } - m = [n, reference.split.size].min - add = 0.0 - logbleu = 0.0 - (1).upto(m) { |i| - counts_clipped = 0 - counts_sum = h_ng[i].size - h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } - add = 1.0 if i >= 2 - logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add); - } - logbleu /= m - logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth - return Math.exp logbleu -end - - -end # module - diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb deleted file mode 100644 index 94afa23..0000000 --- a/lib/nlp_ruby/dag.rb +++ /dev/null @@ -1,205 +0,0 @@ -module DAG - -require 'json' - - -class DAG::Node - attr_accessor :label, :outgoing, :incoming, :score, :mark - - def initialize label=nil, outgoing=[], incoming=[], score=nil - @label = label - @outgoing = outgoing - @incoming = incoming - @score = nil - end - - def add_edge head, weight=0 - exit if self==head # no self-cycles! - @outgoing << DAG::Edge.new(self, head, weight) - return @outgoing.last - end - - def to_s - "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>" - end - - def repr - "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" - end -end - -class DAG::Edge - attr_accessor :tail, :head, :weight, :mark - - def initialize tail=nil, head=nil, weight=0 - @tail = tail - @head = head - @weight = weight - @mark = false # did we already follow this edge? -- for topological sorting - end - - def to_s - s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}" - s += " x" if @mark - s += ">" - s - end -end - -# depth-first search -# w/o markings as we do not have cycles -def DAG::dfs n, target_label - return n if n.label==target_label # assumes uniq labels! - stack = n.outgoing.map { |i| i.head } - while !stack.empty? - m = stack.pop - return DAG::dfs m, target_label - end - return nil -end - -# breadth-first search -# w/o markings as we do not have cycles -def DAG::bfs n, target_label - queue = [n] - while !queue.empty? - m = queue.shift - return m if m.label==target_label - m.outgoing.each { |e| queue << e.head } - end - return nil -end - -# topological sort -def DAG::topological_sort graph - sorted = [] - s = graph.reject { |n| !n.incoming.empty? } - while !s.empty? - sorted << s.shift - sorted.last.outgoing.each { |e| - e.mark = true - s << e.head if e.head.incoming.reject{|f| f.mark}.empty? - } - end - return sorted -end - -# initialize graph scores with semiring One -def DAG::init graph, semiring, source_node - graph.each {|n| n.score=semiring.null} - source_node.score = semiring.one -end - -# viterbi -def DAG::viterbi graph, semiring=ViterbiSemiring, source_node - toposorted = DAG::topological_sort(graph) - DAG::init(graph, semiring, source_node) - toposorted.each { |n| - n.incoming.each { |e| - # update - n.score = \ - semiring.add.call(n.score, \ - semiring.multiply.call(e.tail.score, e.weight) - ) - } - } -end - -# forward viterbi -def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node - toposorted = DAG::topological_sort(graph) - DAG::init(graph, semiring, source_node) - toposorted.each { |n| - n.outgoing.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(n.score, e.weight) - ) - } - } -end - -# Dijkstra algorithm -# for A*-search we would need an optimistic estimate of -# future cost at each node -def DAG::dijkstra graph, semiring=RealSemiring.new, source_node - DAG::init(graph, semiring, source_node) - q = PriorityQueue.new graph - while !q.empty? - n = q.pop - n.outgoing.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(n.score, e.weight)) - q.sort! - } - end -end - -# Bellman-Ford algorithm -def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node) - DAG::init(graph, semiring, source_node) - edges = [] - graph.each { |n| edges |= n.outgoing } - # relax edges - (graph.size-1).times{ |i| - edges.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(e.tail.score, e.weight)) - } - } - # we do not allow cycles (negative or positive) -end - -# Floyd algorithm -def DAG::floyd(graph, semiring=nil) - dist_matrix = [] - graph.each_index { |i| - dist_matrix << [] - graph.each_index { |j| - val = 1.0/0.0 - val = 0.0 if i==j - dist_matrix.last << val - } - } - edges = [] - graph.each { |n| edges |= n.outgoing } - edges.each { |e| - dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight - } - 0.upto(graph.size-1) { |k| - 0.upto(graph.size-1) { |i| - 0.upto(graph.size-1) { |j| - if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j] - dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j] - end - } - } - } - return dist_matrix -end - - -# returns a list of nodes (graph) and a hash for finding -# nodes by their label (these need to be unique!) -def DAG::read_graph_from_json fn, semiring=RealSemiring.new - graph = [] - nodes_by_label = {} - h = JSON.parse File.new(fn).read - h['nodes'].each { |i| - n = DAG::Node.new i['label'] - graph << n - nodes_by_label[n.label] = n - } - h['edges'].each { |i| - n = nodes_by_label[i['tail']] - a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f)) - nodes_by_label[i['head']].incoming << a - } - return graph, nodes_by_label -end - - -end # module - diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb deleted file mode 100644 index eb69136..0000000 --- a/lib/nlp_ruby/fileutil.rb +++ /dev/null @@ -1,88 +0,0 @@ -require 'zlib' - - -class ReadFile - - def initialize fn, encoding='utf-8' - if fn.split('.').last == 'gz' - @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding) - elsif fn == '-' - @f = STDIN - STDIN.set_encoding encoding - else - @f = File.new fn, 'r' - @f.set_encoding encoding - end - end - - def gets - @f.gets { |line| yield line } - end - - def readlines - @f.readlines - end - - def self.readlines fn, encoding='utf-8' - f = ReadFile.new fn, encoding - r = f.readlines - f.close - return r - end - - def readlines_strip - self.readlines.map{ |i| i.strip } - end - - def self.readlines_strip fn, encoding='utf-8' - f = ReadFile.new fn, encoding - r = f.readlines_strip - f.close - return r - end - - def read - @f.read - end - - def self.read fn, encoding='utf-8' - f = ReadFile.new fn, encoding - r = f.read - f.close - return r - end - - def close - @f.close if @f!=STDIN - end -end - -class WriteFile - - def initialize fn, encoding='utf-8' - if fn.split('.').last == 'gz' - @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding) - elsif fn == '-' - @f = STDOUT - STDOUT.set_encoding encoding - else - @f = File.new fn, 'w+' - @f.set_encoding encoding - end - end - - def write s - @f.write s - end - - def self.write s, fn, encoding='utf-8' - f = WriteFile.new fn, encoding - f.write s - f.close - end - - def close - @f.close if @f!=STDIN - end -end - diff --git a/lib/nlp_ruby/grammar.rb b/lib/nlp_ruby/grammar.rb deleted file mode 100644 index 7bd8fe6..0000000 --- a/lib/nlp_ruby/grammar.rb +++ /dev/null @@ -1,122 +0,0 @@ -module Grammar - -class T - attr_accessor :word - - def initialize word - @word = word - end - - def to_s - "T<#{@word}>" - end -end - -class NT - attr_accessor :symbol, :index, :span - - def initialize symbol, index=0 - @symbol = symbol - @index = index - @span = Span.new - end - - def to_s - "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>" - end -end - -class Rule - attr_accessor :lhs, :rhs, :e - - def initialize lhs=nil, rhs=[], e='' - @lhs = lhs - @rhs = rhs - @e = e - end - - def to_s - "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}" - end - - def arity - rhs.select { |i| i.class == NT }.size - end - - def from_s s - _ = splitpipe s, 3 - @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "") - _[1].split.each { |x| - x.strip! - if x[0]=='[' && x[x.size-1] == ']' - @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0]) - else - @rhs << T.new(x) - end - } - @e = _[2] - end - - def self.from_s s - r = self.new - r.from_s s - return r - end -end - -class Span - attr_accessor :left, :right - - def initialize left=nil, right=nil - @left = left - @right = right - end -end - -class Grammar - attr_accessor :rules, :startn, :startt, :flat - - def initialize fn - @rules = []; @startn = []; @startt = [] ;@flat = [] - ReadFile.readlines_strip(fn).each_with_index { |s,i| - STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0 - @rules << Rule.from_s(s) - if @rules.last.rhs.first.class == NT - @startn << @rules.last - else - if rules.last.arity == 0 - @flat << @rules.last - else - @startt << @rules.last - end - end - } - STDERR.write "\n" - end - - def to_s - s = '' - @rules.each { |r| s += r.to_s+"\n" } - return s - end - - def add_glue_rules - @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol| - @rules << Rule.new(NT.new('S'), [NT.new(symbol)]) - @startn << @rules.last - @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')]) - @startn << @rules.last - } - end - - def add_pass_through_rules s - s.each { |word| - @rules << Rule.new(NT.new('X'), [T.new(word)]) - @flat << @rules.last - } - end -end - - -end # module - diff --git a/lib/nlp_ruby/hg.rb b/lib/nlp_ruby/hg.rb deleted file mode 100644 index b8b147e..0000000 --- a/lib/nlp_ruby/hg.rb +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env ruby - -require 'nlp_ruby' -require 'json' - - -module HG - - -class HG::Node - attr_accessor :label, :cat, :outgoing, :incoming, :score - - def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil - @label = label - @cat = cat - @outgoing = outgoing - @incoming = incoming - @score = nil - end - - def to_s - "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>" - end -end - -class HG::Hypergraph - attr_accessor :nodes, :edges - - def initialize nodes=[], edges=[] - @nodes = nodes - @edges = edges - end - - def arity - @edges.map { |e| e.arity }.max - end - - def to_s - "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>" - end -end - -class HG::Hyperedge - attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right - - def initialize head=nil, tails=[], weight=0.0, f={} - @head = head - @tails = tails - @weight = weight - @f = f - @mark = 0 - end - - def arity - return @tails.size - end - - def marked? - arity == @mark - end - - def to_s - "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>" - end -end - -def HG::topological_sort nodes - sorted = [] - s = nodes.reject { |n| !n.incoming.empty? } - while !s.empty? - sorted << s.shift - sorted.last.outgoing.each { |e| - next if e.marked? - e.mark += 1 - s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty? - } - end - return sorted -end - -def HG::init nodes, semiring, root - nodes.each { |n| n.score=semiring.null } - root.score = semiring.one -end - -def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - toposorted.each { |n| - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight)) - } - } -end - -def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - best_path = [] - toposorted.each { |n| - best_edge = nil - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add - best_edge = e - end - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight)) - } - best_path << best_edge - } - return best_path, toposorted.last.score -end - -def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false - nodes = [] - edges = [] - nodes_by_label = {} - nodes_by_index = [] - h = JSON.parse File.new(fn).read - w = SparseVector.from_h h['weights'] - h['nodes'].each { |i| - n = Node.new i['label'], i['cat'] - nodes << n - nodes_by_label[n.label] = n - nodes_by_index << n - } - h['edges'].each { |i| - e = Hyperedge.new(nodes_by_label[i['head']], \ - i['tails'].map{|j| nodes_by_label[j]}.to_a, \ - semiring.convert.call(i['weight'].to_f), \ - {}) - e.f = SparseVector.from_h i['f'] - if log_weights - e.weight = Math.exp(w.dot(e.f)) - else - e.weight = w.dot(e.f) - end - e.tails.each { |m| - m.outgoing << e - } - e.head.incoming << e - edges << e - } - return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index -end - -def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - paths = [[]] - toposorted.each { |n| - next if n.incoming.empty? - new_paths = [] - while !paths.empty? - p = paths.pop - n.incoming.each { |e| - new_paths << p+[e] - } - end - paths = new_paths - } - return paths -end - - -end #module - diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb deleted file mode 100644 index 0319a5f..0000000 --- a/lib/nlp_ruby/misc.rb +++ /dev/null @@ -1,114 +0,0 @@ -require 'timeout' - - -class Array - def max_index - self.index(self.max) - end - - def is_subset_of? other - self.each { |i| - if other.include? i - return false - end - } - return true - end - - def sum - self.inject(:+) - end - - def mean - self.sum.to_f/self.size - end -end - -class String - - def downcase? - self[/[[:lower:]]/] - end -end - -class PriorityQueue -# This assumes that elements in the queue -# have a numerical member named 'score'. - - def initialize a=Array.new - @queue = Array.new a - sort! - end - - def sort! - @queue.sort_by! { |i| -i.score } - end - - def pop - @queue.pop - end - - def push i - @queue << i - sort! - end - - def empty? - @queue.empty? - end -end - -def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false - STDERR.write cmd+"\n" if debug - pipe_in, pipe_out = IO.pipe - pid = Process.spawn(cmd, :out => pipe_out) - begin - Timeout.timeout(t) { Process.wait pid } - rescue Timeout::Error - Process.kill('TERM', pid) if !ignore_fail - end - pipe_out.close - return pipe_in.read -end - -def read_phrase_table fn - table = {} - f = ReadFile.new fn - while raw_rule = f.gets - french, english, features = splitpipe(raw_rule) - feature_map = SparseVector.from_kv features - if table.has_key? french - table[french] << [english, feature_map ] - else - table[french] = [[english, feature_map]] - end - end - f.close - return table -end - -def cdec_kbest cdec_bin, input, ini, weights, k, unique=true - require 'open3' - cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}" - cmd += " -r" if unique - o,_ = Open3.capture2 "#{cmd} 2>/dev/null" - a = []; j = -1 - o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t } - return a -end - -def read_config fn - f = ReadFile.new fn - cfg = {} - while line = f.gets - line.strip! - next if /^\s*$/.match line - next if line[0]=='#' - content = line.split('#', 2).first - k, v = content.split(/\s*=\s*/, 2) - k.strip!; v.strip! - cfg[k] = v - end - return cfg -end - diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb deleted file mode 100644 index fda4683..0000000 --- a/lib/nlp_ruby/semirings.rb +++ /dev/null @@ -1,79 +0,0 @@ -# Semirings for directed acyclic graphs (dags) (also directed hypergraphs), -# as described in: -# 'Dynamic Programming Algorithms in -# Semiring and Hypergraph Frameworks' (Liang Huang) -class Semiring - attr_accessor :add, :multiply, :one, :null, :convert -end - -class BooleanSemiring < Semiring - def initialize - @add = Proc.new { |a,b| a||b } - @multiply = Proc.new { |a,b| a&&b } - @one = true - @null = false - @convert = Proc.new { |v| true && v!=0 } - end -end - -class ViterbiSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].max } - @multiply = Proc.new { |a,b| a*b } - @one = 1.0 - @null = 0.0 - @convert = Proc.new { |v| v } - end -end - -class ViterbiLogSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].max } - @multiply = Proc.new { |a,b| a+b } - @one = 0.0 - @null = -1.0/0.0 - @convert = Proc.new { |v| v } - end -end - -class InsideSemiring < Semiring - def initialize - @add = Proc.new { |a,b| a+b } - @multiply = Proc.new { |a,b| a*b } - @one = 1.0 - @null = 0.0 - @convert = Proc.new { |v| v } - end -end - -class RealSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].min } - @multiply = Proc.new { |a,b| a+b } - @one = 0.0 - @null = 1.0/0.0 - @convert = Proc.new { |v| v } - end -end - -# for longest/worst paths -class RealxSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].max } - @multiply = Proc.new { |a,b| a+b } - @one = -1.0/0.0 - @null = 0.0 - @convert = Proc.new { |v| v } - end -end - -class CountingSemiring < Semiring - def initialize - @add = Proc.new { |a,b| a+b } - @multiply = Proc.new { |a,b| a*b } - @one = 1.0 - @null = 0.0 - @convert = Proc.new { |v| if v!=0 then 1 else 0 end } - end -end - diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb deleted file mode 100644 index aa9be00..0000000 --- a/lib/nlp_ruby/stringutil.rb +++ /dev/null @@ -1,22 +0,0 @@ -def tokenize s - s.strip.split -end - -def ngrams(s, n, fix=false) - a = tokenize s - a.each_with_index { |tok, i| - tok.strip! - 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n) - } - } -end - -def bag_of_words s, stopwords=[] - s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } -end - -def splitpipe s, n=3 - s.strip.split("|"*n) -end - diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb deleted file mode 100644 index 13a40a3..0000000 --- a/lib/nlp_ruby/tfidf.rb +++ /dev/null @@ -1,38 +0,0 @@ -module TFIDF - - -# returns key='raw frequency' for an -# array-like object -def TFIDF::tf array, stopwords=[] - v = {}; v.default = 0 - array.uniq.each { |i| - next if stopwords.include? i - v[i] = array.count(i).to_f - } - return v -end - -# smoothes raw frequencies of tf() in-place -# a is a smoothing term -def TFIDF::ntf hash, a=0.4 - max = hash.values.max.to_f - hash.each_pair { |k,v| - hash[k] = a + (1-a)*(v/max) - } -end - -# returns idf value for each word in a vocabulary -def TFIDF::idf list_of_hashes - vocab = list_of_hashes.values.flatten.uniq - n = list_of_hashes.size.to_f - idf = {} - vocab.each { |i| - df = list_of_hashes.values.flatten.count i - idf[i] = Math.log(n/df) - } - return idf -end - - -end #module - |