From 4059a5d048cb0f72872c98073ef1ce120a30d78c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 16 Jun 2014 17:44:07 +0200 Subject: renaming to zipf --- Makefile | 14 +-- README.md | 6 +- lib/nlp_ruby.rb | 18 ---- lib/nlp_ruby/SparseVector.rb | 172 ------------------------------------ lib/nlp_ruby/Translation.rb | 72 --------------- lib/nlp_ruby/bleu.rb | 130 --------------------------- lib/nlp_ruby/dag.rb | 205 ------------------------------------------- lib/nlp_ruby/fileutil.rb | 88 ------------------- lib/nlp_ruby/grammar.rb | 122 ------------------------- lib/nlp_ruby/hg.rb | 173 ------------------------------------ lib/nlp_ruby/misc.rb | 114 ------------------------ lib/nlp_ruby/semirings.rb | 79 ----------------- lib/nlp_ruby/stringutil.rb | 22 ----- lib/nlp_ruby/tfidf.rb | 38 -------- lib/zipf.rb | 18 ++++ lib/zipf/SparseVector.rb | 172 ++++++++++++++++++++++++++++++++++++ lib/zipf/Translation.rb | 72 +++++++++++++++ lib/zipf/bleu.rb | 130 +++++++++++++++++++++++++++ lib/zipf/dag.rb | 205 +++++++++++++++++++++++++++++++++++++++++++ lib/zipf/fileutil.rb | 88 +++++++++++++++++++ lib/zipf/grammar.rb | 123 ++++++++++++++++++++++++++ lib/zipf/hg.rb | 173 ++++++++++++++++++++++++++++++++++++ lib/zipf/misc.rb | 114 ++++++++++++++++++++++++ lib/zipf/semirings.rb | 81 +++++++++++++++++ lib/zipf/stringutil.rb | 22 +++++ lib/zipf/tfidf.rb | 38 ++++++++ nlp_ruby.gemspec | 13 --- test/test_dag.rb | 2 +- test/test_hg.rb | 2 +- zipf.gemspec | 13 +++ 30 files changed, 1261 insertions(+), 1258 deletions(-) delete mode 100755 lib/nlp_ruby.rb delete mode 100644 lib/nlp_ruby/SparseVector.rb delete mode 100644 lib/nlp_ruby/Translation.rb delete mode 100644 lib/nlp_ruby/bleu.rb delete mode 100644 lib/nlp_ruby/dag.rb delete mode 100644 lib/nlp_ruby/fileutil.rb delete mode 100644 lib/nlp_ruby/grammar.rb delete mode 100644 lib/nlp_ruby/hg.rb delete mode 100644 lib/nlp_ruby/misc.rb delete mode 100644 lib/nlp_ruby/semirings.rb delete mode 100644 lib/nlp_ruby/stringutil.rb delete mode 100644 lib/nlp_ruby/tfidf.rb create mode 100755 lib/zipf.rb create mode 100644 lib/zipf/SparseVector.rb create mode 100644 lib/zipf/Translation.rb create mode 100644 lib/zipf/bleu.rb create mode 100644 lib/zipf/dag.rb create mode 100644 lib/zipf/fileutil.rb create mode 100644 lib/zipf/grammar.rb create mode 100644 lib/zipf/hg.rb create mode 100644 lib/zipf/misc.rb create mode 100644 lib/zipf/semirings.rb create mode 100644 lib/zipf/stringutil.rb create mode 100644 lib/zipf/tfidf.rb delete mode 100644 nlp_ruby.gemspec create mode 100644 zipf.gemspec diff --git a/Makefile b/Makefile index abebe27..d8ae75b 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,19 @@ -version := $$(grep s.version nlp_ruby.gemspec | awk '{print $$3}' | sed "s|'||g") +version := $$(grep s.version zipf.gemspec | awk '{print $$3}' | sed "s|'||g") -all: lib/nlp_ruby.rb lib/nlp_ruby/bleu.rb lib/nlp_ruby/dag.rb lib/nlp_ruby/fileutil.rb lib/nlp_ruby/misc.rb lib/nlp_ruby/semirings.rb lib/nlp_ruby/SparseVector.rb lib/nlp_ruby/stringutil.rb lib/nlp_ruby/tfidf.rb lib/nlp_ruby/Translation.rb - gem build nlp_ruby.gemspec +all: lib/zipf.rb lib/zipf/bleu.rb lib/zipf/dag.rb lib/zipf/fileutil.rb lib/zipf/misc.rb lib/zipf/semirings.rb lib/zipf/SparseVector.rb lib/zipf/stringutil.rb lib/zipf/tfidf.rb lib/zipf/Translation.rb + gem build zipf.gemspec install: - gem install nlp_ruby-$(version).gem + gem install zipf-$(version).gem clean: - rm nlp_ruby-$(version).gem - gem uninstall nlp_ruby -v $(version) + rm zipf-$(version).gem + gem uninstall zipf -v $(version) rake_test: rake test publish: - gem push nlp_ruby-$(version).gem + gem push zipf-$(version).gem diff --git a/README.md b/README.md index cc8d3ef..ff5bd3d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -nlp_ruby -======== +zipf +==== My little NLP library, supposed to make _my_ work a little easier and less redundant. -The .gem can be found here: https://rubygems.org/gems/nlp_ruby +The .gem can be found here: https://rubygems.org/gems/zipf bleu.rb : BLEU implementation, also per-sentence-BLEU dag.rb : implementation of a directed acyclic graph and various algorithms diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb deleted file mode 100755 index 0e26e97..0000000 --- a/lib/nlp_ruby.rb +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env ruby - -require 'nlp_ruby/stringutil' -require 'nlp_ruby/fileutil' -require 'nlp_ruby/SparseVector' -require 'nlp_ruby/tfidf' -require 'nlp_ruby/Translation' -require 'nlp_ruby/dag' -require 'nlp_ruby/semirings' -require 'nlp_ruby/bleu' -require 'nlp_ruby/misc' -require 'nlp_ruby/hg' -require 'nlp_ruby/grammar' - -STDIN.set_encoding 'utf-8' -STDOUT.set_encoding 'utf-8' -STDERR.set_encoding 'utf-8' - diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb deleted file mode 100644 index 3096412..0000000 --- a/lib/nlp_ruby/SparseVector.rb +++ /dev/null @@ -1,172 +0,0 @@ -class SparseVector < Hash - - def initialize arg=nil - super - self.default = 0 - if arg.is_a? Array - from_a arg - end - end - - def from_a a - a.each_with_index { |i,j| self[j] = i } - end - - def self.from_a a - v = SparseVector.new - v.from_a a - return v - end - - def from_h h - h.each_pair { |k,v| self[k] = v } - end - - def self.from_h h - v = SparseVector.new - v.from_h h - return v - end - - def from_s s - from_h eval(s) - end - - def self.from_s s - v = SparseVector.new - v.from_s s - return v - end - - def to_kv sep='=', join=' ' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join join - end - - def from_kv s - s.split.each { |i| - k,v = i.split('=') - self[k] = v.to_f - } - end - - def self.from_kv s - v = SparseVector.new - v.from_kv s - return v - end - - def from_file fn, sep='=' - f = ReadFile.new(fn) - while line = f.gets - key, value = line.strip.split sep - value = value.to_f - self[key] = value - end - end - - def self.from_file fn, sep='=' - v = SparseVector.new - v.from_file fn, sep - return v - end - - def join_keys other - self.keys + other.keys - end - - def sum - self.values.inject(:+) - end - - def approx_eql? other, p=10**-10 - return false if !other - return false if other.size!=self.size - return false if other.keys.sort!=self.keys.sort - self.keys.each { |k| - return false if (self[k]-other[k]).abs>p - } - return true - end - - def average - self.sum/self.size.to_f - end - - def variance - avg = self.average - var = 0.0 - self.values.each { |i| var += (avg - i)**2 } - return var - end - - def stddev - Math.sqrt self.variance - end - - def dot other - sum = 0.0 - self.each_pair { |k,v| sum += v * other[k] } - return sum - end - - def zeros n - (0).upto(n-1) { |i| self[i] = 0.0 } - end - - def magnitude - Math.sqrt self.values.inject { |sum,i| sum+i**2 } - end - - def cosinus_sim other - self.dot(other)/(self.magnitude*other.magnitude) - end - - def euclidian_dist other - dims = [self.keys, other.keys].flatten.uniq - sum = 0.0 - dims.each { |d| sum += (self[d] - other[d])**2 } - return Math.sqrt(sum) - end - - def + other - new = SparseVector.new - join_keys(other).each { |k| - new[k] = self[k]+other[k] - } - return new - end - - def - other - new = SparseVector.new - join_keys(other).each { |k| - new[k] = self[k]-other[k] - } - return new - end - - def * scalar - raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric - new = SparseVector.new - self.keys.each { |k| - new[k] = self[k] * scalar - } - return new - end - - def self.mean a - mean = SparseVector.new - a.each { |i| - i.each_pair { |k,v| - mean[k] += v - } - } - n = a.size.to_f - mean.each_pair { |k,v| mean[k] = v/n } - return mean - end -end - diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb deleted file mode 100644 index 3759a1d..0000000 --- a/lib/nlp_ruby/Translation.rb +++ /dev/null @@ -1,72 +0,0 @@ -class Translation - attr_accessor :id, :s, :raw, :f, :scores, :rank - - def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil - @id = id - @raw = raw - @s = s - @f = f - @scores = scores - @rank = rank - end - - def from_s t, strip_alignment=true, rank=nil - id, raw, features, score = splitpipe(t, 3) - raw.strip! - @raw = raw - if strip_alignment # the way moses does it - @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ') - @s.strip! - else - @s = raw - end - @id = id.to_i - @f = SparseVector.from_kv features - @scores[:decoder] = score.to_f - @rank = rank - end - - def self.from_s s - t = self.new - t.from_s s - return t - end - - def to_s include_features=true - [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features - [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features - end - - def to_s2 - [@rank, @s, @score, @scores.to_s].join ' ||| ' - end -end - -def read_kbest_lists fn, translation_type=Translation - kbest_lists = [] - cur = [] - f = ReadFile.new fn - prev = -1 - c = 0 - id = 0 - while line = f.gets - t = translation_type.new - t.from_s line - c = splitpipe(line)[0].to_i - if c != prev - if cur.size > 0 - kbest_lists << cur - cur = [] - end - prev = c - id = 0 - end - t.id = id - cur << t - id += 1 - end - kbest_lists << cur # last one - f.close - return kbest_lists -end - diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb deleted file mode 100644 index 56f341b..0000000 --- a/lib/nlp_ruby/bleu.rb +++ /dev/null @@ -1,130 +0,0 @@ -module BLEU - - -class BLEU::NgramCounts - attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n - - def initialize(n) - @n = 0 - @sum = [] - @clipped = [] - @ref_len = 0.0 - @hyp_len = 0.0 - grow(n) - end - - def grow(n) - (n-@n).times { - @sum << 0.0 - @clipped << 0.0 - } - @n = n - end - - def plus_eq(other) - if other.n > @n then grow(other.n) end - 0.upto(other.n-1) { |m| - @sum[m] += other.sum[m] - @clipped[m] += other.clipped[m] - } - @ref_len += other.ref_len - @hyp_len += other.hyp_len - end - - def to_s - return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}" - end -end - -class BLEU::Ngrams - def initialize - @h_ = {} - @h_.default = 0 - end - - def add(k) - if k.class == Array then k = k.join ' ' end - @h_[k] += 1 - end - - def get_count(k) - if k.class == Array then k = k.join ' ' end - return @h_[k] - end - - def each - @h_.each_pair { |k,v| - yield k.split, v - } - end - - def to_s - @h_.to_s - end -end - -def BLEU::get_counts hypothesis, reference, n, times=1 - p = NgramCounts.new n - r = Ngrams.new - ngrams(reference, n) { |ng| r.add ng } - h = Ngrams.new - ngrams(hypothesis, n) { |ng| h.add ng } - h.each { |ng,count| - sz = ng.size-1 - p.sum[sz] += count * times - p.clipped[sz] += [r.get_count(ng), count].min * times - } - p.ref_len = tokenize(reference.strip).size * times - p.hyp_len = tokenize(hypothesis.strip).size * times - return p -end - -def BLEU::brevity_penalty c, r, smooth=0.0 - return [0.0, 1.0-((r+smooth)/c)].min -end - -def BLEU::bleu counts, n, debug=false - corpus_stats = NgramCounts.new n - counts.each { |i| corpus_stats.plus_eq i } - logbleu = 0.0 - 0.upto(n-1) { |m| - STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug - return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0 - logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m]) - } - logbleu /= n - if debug - STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n" - STDERR.write "sum #{Math.exp(sum)}\n" - end - logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len - return Math.exp logbleu -end - -def BLEU::hbleu counts, n, debug=false - (100*bleu(counts, n, debug)).round(3) -end - -def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0 - h_ng = {}; r_ng = {} - (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] } - ngrams(hypothesis, n) { |i| h_ng[i.size] << i } - ngrams(reference, n) { |i| r_ng[i.size] << i } - m = [n, reference.split.size].min - add = 0.0 - logbleu = 0.0 - (1).upto(m) { |i| - counts_clipped = 0 - counts_sum = h_ng[i].size - h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } - add = 1.0 if i >= 2 - logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add); - } - logbleu /= m - logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth - return Math.exp logbleu -end - - -end # module - diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb deleted file mode 100644 index 94afa23..0000000 --- a/lib/nlp_ruby/dag.rb +++ /dev/null @@ -1,205 +0,0 @@ -module DAG - -require 'json' - - -class DAG::Node - attr_accessor :label, :outgoing, :incoming, :score, :mark - - def initialize label=nil, outgoing=[], incoming=[], score=nil - @label = label - @outgoing = outgoing - @incoming = incoming - @score = nil - end - - def add_edge head, weight=0 - exit if self==head # no self-cycles! - @outgoing << DAG::Edge.new(self, head, weight) - return @outgoing.last - end - - def to_s - "DAG::Node" - end - - def repr - "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" - end -end - -class DAG::Edge - attr_accessor :tail, :head, :weight, :mark - - def initialize tail=nil, head=nil, weight=0 - @tail = tail - @head = head - @weight = weight - @mark = false # did we already follow this edge? -- for topological sorting - end - - def to_s - s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}" - s += " x" if @mark - s += ">" - s - end -end - -# depth-first search -# w/o markings as we do not have cycles -def DAG::dfs n, target_label - return n if n.label==target_label # assumes uniq labels! - stack = n.outgoing.map { |i| i.head } - while !stack.empty? - m = stack.pop - return DAG::dfs m, target_label - end - return nil -end - -# breadth-first search -# w/o markings as we do not have cycles -def DAG::bfs n, target_label - queue = [n] - while !queue.empty? - m = queue.shift - return m if m.label==target_label - m.outgoing.each { |e| queue << e.head } - end - return nil -end - -# topological sort -def DAG::topological_sort graph - sorted = [] - s = graph.reject { |n| !n.incoming.empty? } - while !s.empty? - sorted << s.shift - sorted.last.outgoing.each { |e| - e.mark = true - s << e.head if e.head.incoming.reject{|f| f.mark}.empty? - } - end - return sorted -end - -# initialize graph scores with semiring One -def DAG::init graph, semiring, source_node - graph.each {|n| n.score=semiring.null} - source_node.score = semiring.one -end - -# viterbi -def DAG::viterbi graph, semiring=ViterbiSemiring, source_node - toposorted = DAG::topological_sort(graph) - DAG::init(graph, semiring, source_node) - toposorted.each { |n| - n.incoming.each { |e| - # update - n.score = \ - semiring.add.call(n.score, \ - semiring.multiply.call(e.tail.score, e.weight) - ) - } - } -end - -# forward viterbi -def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node - toposorted = DAG::topological_sort(graph) - DAG::init(graph, semiring, source_node) - toposorted.each { |n| - n.outgoing.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(n.score, e.weight) - ) - } - } -end - -# Dijkstra algorithm -# for A*-search we would need an optimistic estimate of -# future cost at each node -def DAG::dijkstra graph, semiring=RealSemiring.new, source_node - DAG::init(graph, semiring, source_node) - q = PriorityQueue.new graph - while !q.empty? - n = q.pop - n.outgoing.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(n.score, e.weight)) - q.sort! - } - end -end - -# Bellman-Ford algorithm -def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node) - DAG::init(graph, semiring, source_node) - edges = [] - graph.each { |n| edges |= n.outgoing } - # relax edges - (graph.size-1).times{ |i| - edges.each { |e| - e.head.score = \ - semiring.add.call(e.head.score, \ - semiring.multiply.call(e.tail.score, e.weight)) - } - } - # we do not allow cycles (negative or positive) -end - -# Floyd algorithm -def DAG::floyd(graph, semiring=nil) - dist_matrix = [] - graph.each_index { |i| - dist_matrix << [] - graph.each_index { |j| - val = 1.0/0.0 - val = 0.0 if i==j - dist_matrix.last << val - } - } - edges = [] - graph.each { |n| edges |= n.outgoing } - edges.each { |e| - dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight - } - 0.upto(graph.size-1) { |k| - 0.upto(graph.size-1) { |i| - 0.upto(graph.size-1) { |j| - if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j] - dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j] - end - } - } - } - return dist_matrix -end - - -# returns a list of nodes (graph) and a hash for finding -# nodes by their label (these need to be unique!) -def DAG::read_graph_from_json fn, semiring=RealSemiring.new - graph = [] - nodes_by_label = {} - h = JSON.parse File.new(fn).read - h['nodes'].each { |i| - n = DAG::Node.new i['label'] - graph << n - nodes_by_label[n.label] = n - } - h['edges'].each { |i| - n = nodes_by_label[i['tail']] - a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f)) - nodes_by_label[i['head']].incoming << a - } - return graph, nodes_by_label -end - - -end # module - diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb deleted file mode 100644 index eb69136..0000000 --- a/lib/nlp_ruby/fileutil.rb +++ /dev/null @@ -1,88 +0,0 @@ -require 'zlib' - - -class ReadFile - - def initialize fn, encoding='utf-8' - if fn.split('.').last == 'gz' - @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding) - elsif fn == '-' - @f = STDIN - STDIN.set_encoding encoding - else - @f = File.new fn, 'r' - @f.set_encoding encoding - end - end - - def gets - @f.gets { |line| yield line } - end - - def readlines - @f.readlines - end - - def self.readlines fn, encoding='utf-8' - f = ReadFile.new fn, encoding - r = f.readlines - f.close - return r - end - - def readlines_strip - self.readlines.map{ |i| i.strip } - end - - def self.readlines_strip fn, encoding='utf-8' - f = ReadFile.new fn, encoding - r = f.readlines_strip - f.close - return r - end - - def read - @f.read - end - - def self.read fn, encoding='utf-8' - f = ReadFile.new fn, encoding - r = f.read - f.close - return r - end - - def close - @f.close if @f!=STDIN - end -end - -class WriteFile - - def initialize fn, encoding='utf-8' - if fn.split('.').last == 'gz' - @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding) - elsif fn == '-' - @f = STDOUT - STDOUT.set_encoding encoding - else - @f = File.new fn, 'w+' - @f.set_encoding encoding - end - end - - def write s - @f.write s - end - - def self.write s, fn, encoding='utf-8' - f = WriteFile.new fn, encoding - f.write s - f.close - end - - def close - @f.close if @f!=STDIN - end -end - diff --git a/lib/nlp_ruby/grammar.rb b/lib/nlp_ruby/grammar.rb deleted file mode 100644 index 7bd8fe6..0000000 --- a/lib/nlp_ruby/grammar.rb +++ /dev/null @@ -1,122 +0,0 @@ -module Grammar - -class T - attr_accessor :word - - def initialize word - @word = word - end - - def to_s - "T<#{@word}>" - end -end - -class NT - attr_accessor :symbol, :index, :span - - def initialize symbol, index=0 - @symbol = symbol - @index = index - @span = Span.new - end - - def to_s - "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>" - end -end - -class Rule - attr_accessor :lhs, :rhs, :e - - def initialize lhs=nil, rhs=[], e='' - @lhs = lhs - @rhs = rhs - @e = e - end - - def to_s - "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}" - end - - def arity - rhs.select { |i| i.class == NT }.size - end - - def from_s s - _ = splitpipe s, 3 - @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "") - _[1].split.each { |x| - x.strip! - if x[0]=='[' && x[x.size-1] == ']' - @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0]) - else - @rhs << T.new(x) - end - } - @e = _[2] - end - - def self.from_s s - r = self.new - r.from_s s - return r - end -end - -class Span - attr_accessor :left, :right - - def initialize left=nil, right=nil - @left = left - @right = right - end -end - -class Grammar - attr_accessor :rules, :startn, :startt, :flat - - def initialize fn - @rules = []; @startn = []; @startt = [] ;@flat = [] - ReadFile.readlines_strip(fn).each_with_index { |s,i| - STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0 - @rules << Rule.from_s(s) - if @rules.last.rhs.first.class == NT - @startn << @rules.last - else - if rules.last.arity == 0 - @flat << @rules.last - else - @startt << @rules.last - end - end - } - STDERR.write "\n" - end - - def to_s - s = '' - @rules.each { |r| s += r.to_s+"\n" } - return s - end - - def add_glue_rules - @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol| - @rules << Rule.new(NT.new('S'), [NT.new(symbol)]) - @startn << @rules.last - @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')]) - @startn << @rules.last - } - end - - def add_pass_through_rules s - s.each { |word| - @rules << Rule.new(NT.new('X'), [T.new(word)]) - @flat << @rules.last - } - end -end - - -end # module - diff --git a/lib/nlp_ruby/hg.rb b/lib/nlp_ruby/hg.rb deleted file mode 100644 index b8b147e..0000000 --- a/lib/nlp_ruby/hg.rb +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env ruby - -require 'nlp_ruby' -require 'json' - - -module HG - - -class HG::Node - attr_accessor :label, :cat, :outgoing, :incoming, :score - - def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil - @label = label - @cat = cat - @outgoing = outgoing - @incoming = incoming - @score = nil - end - - def to_s - "Node" - end -end - -class HG::Hypergraph - attr_accessor :nodes, :edges - - def initialize nodes=[], edges=[] - @nodes = nodes - @edges = edges - end - - def arity - @edges.map { |e| e.arity }.max - end - - def to_s - "Hypergraph" - end -end - -class HG::Hyperedge - attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right - - def initialize head=nil, tails=[], weight=0.0, f={} - @head = head - @tails = tails - @weight = weight - @f = f - @mark = 0 - end - - def arity - return @tails.size - end - - def marked? - arity == @mark - end - - def to_s - "Hyperedge" - end -end - -def HG::topological_sort nodes - sorted = [] - s = nodes.reject { |n| !n.incoming.empty? } - while !s.empty? - sorted << s.shift - sorted.last.outgoing.each { |e| - next if e.marked? - e.mark += 1 - s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty? - } - end - return sorted -end - -def HG::init nodes, semiring, root - nodes.each { |n| n.score=semiring.null } - root.score = semiring.one -end - -def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - toposorted.each { |n| - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight)) - } - } -end - -def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - best_path = [] - toposorted.each { |n| - best_edge = nil - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add - best_edge = e - end - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight)) - } - best_path << best_edge - } - return best_path, toposorted.last.score -end - -def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false - nodes = [] - edges = [] - nodes_by_label = {} - nodes_by_index = [] - h = JSON.parse File.new(fn).read - w = SparseVector.from_h h['weights'] - h['nodes'].each { |i| - n = Node.new i['label'], i['cat'] - nodes << n - nodes_by_label[n.label] = n - nodes_by_index << n - } - h['edges'].each { |i| - e = Hyperedge.new(nodes_by_label[i['head']], \ - i['tails'].map{|j| nodes_by_label[j]}.to_a, \ - semiring.convert.call(i['weight'].to_f), \ - {}) - e.f = SparseVector.from_h i['f'] - if log_weights - e.weight = Math.exp(w.dot(e.f)) - else - e.weight = w.dot(e.f) - end - e.tails.each { |m| - m.outgoing << e - } - e.head.incoming << e - edges << e - } - return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index -end - -def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - paths = [[]] - toposorted.each { |n| - next if n.incoming.empty? - new_paths = [] - while !paths.empty? - p = paths.pop - n.incoming.each { |e| - new_paths << p+[e] - } - end - paths = new_paths - } - return paths -end - - -end #module - diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb deleted file mode 100644 index 0319a5f..0000000 --- a/lib/nlp_ruby/misc.rb +++ /dev/null @@ -1,114 +0,0 @@ -require 'timeout' - - -class Array - def max_index - self.index(self.max) - end - - def is_subset_of? other - self.each { |i| - if other.include? i - return false - end - } - return true - end - - def sum - self.inject(:+) - end - - def mean - self.sum.to_f/self.size - end -end - -class String - - def downcase? - self[/[[:lower:]]/] - end -end - -class PriorityQueue -# This assumes that elements in the queue -# have a numerical member named 'score'. - - def initialize a=Array.new - @queue = Array.new a - sort! - end - - def sort! - @queue.sort_by! { |i| -i.score } - end - - def pop - @queue.pop - end - - def push i - @queue << i - sort! - end - - def empty? - @queue.empty? - end -end - -def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false - STDERR.write cmd+"\n" if debug - pipe_in, pipe_out = IO.pipe - pid = Process.spawn(cmd, :out => pipe_out) - begin - Timeout.timeout(t) { Process.wait pid } - rescue Timeout::Error - Process.kill('TERM', pid) if !ignore_fail - end - pipe_out.close - return pipe_in.read -end - -def read_phrase_table fn - table = {} - f = ReadFile.new fn - while raw_rule = f.gets - french, english, features = splitpipe(raw_rule) - feature_map = SparseVector.from_kv features - if table.has_key? french - table[french] << [english, feature_map ] - else - table[french] = [[english, feature_map]] - end - end - f.close - return table -end - -def cdec_kbest cdec_bin, input, ini, weights, k, unique=true - require 'open3' - cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}" - cmd += " -r" if unique - o,_ = Open3.capture2 "#{cmd} 2>/dev/null" - a = []; j = -1 - o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t } - return a -end - -def read_config fn - f = ReadFile.new fn - cfg = {} - while line = f.gets - line.strip! - next if /^\s*$/.match line - next if line[0]=='#' - content = line.split('#', 2).first - k, v = content.split(/\s*=\s*/, 2) - k.strip!; v.strip! - cfg[k] = v - end - return cfg -end - diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb deleted file mode 100644 index fda4683..0000000 --- a/lib/nlp_ruby/semirings.rb +++ /dev/null @@ -1,79 +0,0 @@ -# Semirings for directed acyclic graphs (dags) (also directed hypergraphs), -# as described in: -# 'Dynamic Programming Algorithms in -# Semiring and Hypergraph Frameworks' (Liang Huang) -class Semiring - attr_accessor :add, :multiply, :one, :null, :convert -end - -class BooleanSemiring < Semiring - def initialize - @add = Proc.new { |a,b| a||b } - @multiply = Proc.new { |a,b| a&&b } - @one = true - @null = false - @convert = Proc.new { |v| true && v!=0 } - end -end - -class ViterbiSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].max } - @multiply = Proc.new { |a,b| a*b } - @one = 1.0 - @null = 0.0 - @convert = Proc.new { |v| v } - end -end - -class ViterbiLogSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].max } - @multiply = Proc.new { |a,b| a+b } - @one = 0.0 - @null = -1.0/0.0 - @convert = Proc.new { |v| v } - end -end - -class InsideSemiring < Semiring - def initialize - @add = Proc.new { |a,b| a+b } - @multiply = Proc.new { |a,b| a*b } - @one = 1.0 - @null = 0.0 - @convert = Proc.new { |v| v } - end -end - -class RealSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].min } - @multiply = Proc.new { |a,b| a+b } - @one = 0.0 - @null = 1.0/0.0 - @convert = Proc.new { |v| v } - end -end - -# for longest/worst paths -class RealxSemiring < Semiring - def initialize - @add = Proc.new { |a,b| [a,b].max } - @multiply = Proc.new { |a,b| a+b } - @one = -1.0/0.0 - @null = 0.0 - @convert = Proc.new { |v| v } - end -end - -class CountingSemiring < Semiring - def initialize - @add = Proc.new { |a,b| a+b } - @multiply = Proc.new { |a,b| a*b } - @one = 1.0 - @null = 0.0 - @convert = Proc.new { |v| if v!=0 then 1 else 0 end } - end -end - diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb deleted file mode 100644 index aa9be00..0000000 --- a/lib/nlp_ruby/stringutil.rb +++ /dev/null @@ -1,22 +0,0 @@ -def tokenize s - s.strip.split -end - -def ngrams(s, n, fix=false) - a = tokenize s - a.each_with_index { |tok, i| - tok.strip! - 0.upto([n-1, a.size-i-1].min) { |m| - yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n) - } - } -end - -def bag_of_words s, stopwords=[] - s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } -end - -def splitpipe s, n=3 - s.strip.split("|"*n) -end - diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb deleted file mode 100644 index 13a40a3..0000000 --- a/lib/nlp_ruby/tfidf.rb +++ /dev/null @@ -1,38 +0,0 @@ -module TFIDF - - -# returns key='raw frequency' for an -# array-like object -def TFIDF::tf array, stopwords=[] - v = {}; v.default = 0 - array.uniq.each { |i| - next if stopwords.include? i - v[i] = array.count(i).to_f - } - return v -end - -# smoothes raw frequencies of tf() in-place -# a is a smoothing term -def TFIDF::ntf hash, a=0.4 - max = hash.values.max.to_f - hash.each_pair { |k,v| - hash[k] = a + (1-a)*(v/max) - } -end - -# returns idf value for each word in a vocabulary -def TFIDF::idf list_of_hashes - vocab = list_of_hashes.values.flatten.uniq - n = list_of_hashes.size.to_f - idf = {} - vocab.each { |i| - df = list_of_hashes.values.flatten.count i - idf[i] = Math.log(n/df) - } - return idf -end - - -end #module - diff --git a/lib/zipf.rb b/lib/zipf.rb new file mode 100755 index 0000000..0e26e97 --- /dev/null +++ b/lib/zipf.rb @@ -0,0 +1,18 @@ +#!/usr/bin/env ruby + +require 'nlp_ruby/stringutil' +require 'nlp_ruby/fileutil' +require 'nlp_ruby/SparseVector' +require 'nlp_ruby/tfidf' +require 'nlp_ruby/Translation' +require 'nlp_ruby/dag' +require 'nlp_ruby/semirings' +require 'nlp_ruby/bleu' +require 'nlp_ruby/misc' +require 'nlp_ruby/hg' +require 'nlp_ruby/grammar' + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' +STDERR.set_encoding 'utf-8' + diff --git a/lib/zipf/SparseVector.rb b/lib/zipf/SparseVector.rb new file mode 100644 index 0000000..3096412 --- /dev/null +++ b/lib/zipf/SparseVector.rb @@ -0,0 +1,172 @@ +class SparseVector < Hash + + def initialize arg=nil + super + self.default = 0 + if arg.is_a? Array + from_a arg + end + end + + def from_a a + a.each_with_index { |i,j| self[j] = i } + end + + def self.from_a a + v = SparseVector.new + v.from_a a + return v + end + + def from_h h + h.each_pair { |k,v| self[k] = v } + end + + def self.from_h h + v = SparseVector.new + v.from_h h + return v + end + + def from_s s + from_h eval(s) + end + + def self.from_s s + v = SparseVector.new + v.from_s s + return v + end + + def to_kv sep='=', join=' ' + a = [] + self.each_pair { |k,v| + a << "#{k}#{sep}#{v}" + } + return a.join join + end + + def from_kv s + s.split.each { |i| + k,v = i.split('=') + self[k] = v.to_f + } + end + + def self.from_kv s + v = SparseVector.new + v.from_kv s + return v + end + + def from_file fn, sep='=' + f = ReadFile.new(fn) + while line = f.gets + key, value = line.strip.split sep + value = value.to_f + self[key] = value + end + end + + def self.from_file fn, sep='=' + v = SparseVector.new + v.from_file fn, sep + return v + end + + def join_keys other + self.keys + other.keys + end + + def sum + self.values.inject(:+) + end + + def approx_eql? other, p=10**-10 + return false if !other + return false if other.size!=self.size + return false if other.keys.sort!=self.keys.sort + self.keys.each { |k| + return false if (self[k]-other[k]).abs>p + } + return true + end + + def average + self.sum/self.size.to_f + end + + def variance + avg = self.average + var = 0.0 + self.values.each { |i| var += (avg - i)**2 } + return var + end + + def stddev + Math.sqrt self.variance + end + + def dot other + sum = 0.0 + self.each_pair { |k,v| sum += v * other[k] } + return sum + end + + def zeros n + (0).upto(n-1) { |i| self[i] = 0.0 } + end + + def magnitude + Math.sqrt self.values.inject { |sum,i| sum+i**2 } + end + + def cosinus_sim other + self.dot(other)/(self.magnitude*other.magnitude) + end + + def euclidian_dist other + dims = [self.keys, other.keys].flatten.uniq + sum = 0.0 + dims.each { |d| sum += (self[d] - other[d])**2 } + return Math.sqrt(sum) + end + + def + other + new = SparseVector.new + join_keys(other).each { |k| + new[k] = self[k]+other[k] + } + return new + end + + def - other + new = SparseVector.new + join_keys(other).each { |k| + new[k] = self[k]-other[k] + } + return new + end + + def * scalar + raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric + new = SparseVector.new + self.keys.each { |k| + new[k] = self[k] * scalar + } + return new + end + + def self.mean a + mean = SparseVector.new + a.each { |i| + i.each_pair { |k,v| + mean[k] += v + } + } + n = a.size.to_f + mean.each_pair { |k,v| mean[k] = v/n } + return mean + end +end + diff --git a/lib/zipf/Translation.rb b/lib/zipf/Translation.rb new file mode 100644 index 0000000..3759a1d --- /dev/null +++ b/lib/zipf/Translation.rb @@ -0,0 +1,72 @@ +class Translation + attr_accessor :id, :s, :raw, :f, :scores, :rank + + def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil + @id = id + @raw = raw + @s = s + @f = f + @scores = scores + @rank = rank + end + + def from_s t, strip_alignment=true, rank=nil + id, raw, features, score = splitpipe(t, 3) + raw.strip! + @raw = raw + if strip_alignment # the way moses does it + @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ') + @s.strip! + else + @s = raw + end + @id = id.to_i + @f = SparseVector.from_kv features + @scores[:decoder] = score.to_f + @rank = rank + end + + def self.from_s s + t = self.new + t.from_s s + return t + end + + def to_s include_features=true + [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features + [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features + end + + def to_s2 + [@rank, @s, @score, @scores.to_s].join ' ||| ' + end +end + +def read_kbest_lists fn, translation_type=Translation + kbest_lists = [] + cur = [] + f = ReadFile.new fn + prev = -1 + c = 0 + id = 0 + while line = f.gets + t = translation_type.new + t.from_s line + c = splitpipe(line)[0].to_i + if c != prev + if cur.size > 0 + kbest_lists << cur + cur = [] + end + prev = c + id = 0 + end + t.id = id + cur << t + id += 1 + end + kbest_lists << cur # last one + f.close + return kbest_lists +end + diff --git a/lib/zipf/bleu.rb b/lib/zipf/bleu.rb new file mode 100644 index 0000000..69de00b --- /dev/null +++ b/lib/zipf/bleu.rb @@ -0,0 +1,130 @@ +module BLEU + + +class BLEU::NgramCounts + attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n + + def initialize(n) + @n = 0 + @sum = [] + @clipped = [] + @ref_len = 0.0 + @hyp_len = 0.0 + grow(n) + end + + def grow(n) + (n-@n).times { + @sum << 0.0 + @clipped << 0.0 + } + @n = n + end + + def plus_eq(other) + if other.n > @n then grow(other.n) end + 0.upto(other.n-1) { |m| + @sum[m] += other.sum[m] + @clipped[m] += other.clipped[m] + } + @ref_len += other.ref_len + @hyp_len += other.hyp_len + end + + def to_s + return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}" + end +end + +class BLEU::Ngrams + def initialize + @h_ = {} + @h_.default = 0 + end + + def add(k) + if k.class == Array then k = k.join ' ' end + @h_[k] += 1 + end + + def get_count(k) + if k.class == Array then k = k.join ' ' end + return @h_[k] + end + + def each + @h_.each_pair { |k,v| + yield k.split, v + } + end + + def to_s + @h_.to_s + end +end + +def BLEU::get_counts hypothesis, reference, n, times=1 + p = NgramCounts.new n + r = Ngrams.new + ngrams(reference, n) { |ng| r.add ng } + h = Ngrams.new + ngrams(hypothesis, n) { |ng| h.add ng } + h.each { |ng,count| + sz = ng.size-1 + p.sum[sz] += count * times + p.clipped[sz] += [r.get_count(ng), count].min * times + } + p.ref_len = tokenize(reference.strip).size * times + p.hyp_len = tokenize(hypothesis.strip).size * times + return p +end + +def BLEU::brevity_penalty c, r, smooth=0.0 + return [0.0, 1.0-((r+smooth)/c)].min +end + +def BLEU::bleu counts, n, debug=false + corpus_stats = NgramCounts.new n + counts.each { |i| corpus_stats.plus_eq i } + logbleu = 0.0 + 0.upto(n-1) { |m| + STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug + return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0 + logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m]) + } + logbleu /= n + if debug + STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n" + STDERR.write "sum #{Math.exp(sum)}\n" + end + logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len + return Math.exp logbleu +end + +def BLEU::hbleu counts, n, debug=false + (100*bleu(counts, n, debug)).round(3) +end + +def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0 + h_ng = {}; r_ng = {} + (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] } + ngrams(hypothesis, n) { |i| h_ng[i.size] << i } + ngrams(reference, n) { |i| r_ng[i.size] << i } + m = [n, reference.split.size].min + add = 0.0 + logbleu = 0.0 + (1).upto(m) { |i| + counts_clipped = 0 + counts_sum = h_ng[i].size + h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } + add = 1.0 if i >= 2 + logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add); + } + logbleu /= m + logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth + return Math.exp logbleu +end + + +end #module + diff --git a/lib/zipf/dag.rb b/lib/zipf/dag.rb new file mode 100644 index 0000000..45ede20 --- /dev/null +++ b/lib/zipf/dag.rb @@ -0,0 +1,205 @@ +module DAG + +require 'json' + + +class DAG::Node + attr_accessor :label, :outgoing, :incoming, :score, :mark + + def initialize label=nil, outgoing=[], incoming=[], score=nil + @label = label + @outgoing = outgoing + @incoming = incoming + @score = nil + end + + def add_edge head, weight=0 + exit if self==head # no self-cycles! + @outgoing << DAG::Edge.new(self, head, weight) + return @outgoing.last + end + + def to_s + "DAG::Node" + end + + def repr + "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" + end +end + +class DAG::Edge + attr_accessor :tail, :head, :weight, :mark + + def initialize tail=nil, head=nil, weight=0 + @tail = tail + @head = head + @weight = weight + @mark = false # did we already follow this edge? -- for topological sorting + end + + def to_s + s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}" + s += " x" if @mark + s += ">" + s + end +end + +# depth-first search +# w/o markings as we do not have cycles +def DAG::dfs n, target_label + return n if n.label==target_label # assumes uniq labels! + stack = n.outgoing.map { |i| i.head } + while !stack.empty? + m = stack.pop + return DAG::dfs m, target_label + end + return nil +end + +# breadth-first search +# w/o markings as we do not have cycles +def DAG::bfs n, target_label + queue = [n] + while !queue.empty? + m = queue.shift + return m if m.label==target_label + m.outgoing.each { |e| queue << e.head } + end + return nil +end + +# topological sort +def DAG::topological_sort graph + sorted = [] + s = graph.reject { |n| !n.incoming.empty? } + while !s.empty? + sorted << s.shift + sorted.last.outgoing.each { |e| + e.mark = true + s << e.head if e.head.incoming.reject{|f| f.mark}.empty? + } + end + return sorted +end + +# initialize graph scores with semiring One +def DAG::init graph, semiring, source_node + graph.each {|n| n.score=semiring.null} + source_node.score = semiring.one +end + +# viterbi +def DAG::viterbi graph, semiring=ViterbiSemiring, source_node + toposorted = DAG::topological_sort(graph) + DAG::init(graph, semiring, source_node) + toposorted.each { |n| + n.incoming.each { |e| + # update + n.score = \ + semiring.add.call(n.score, \ + semiring.multiply.call(e.tail.score, e.weight) + ) + } + } +end + +# forward viterbi +def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node + toposorted = DAG::topological_sort(graph) + DAG::init(graph, semiring, source_node) + toposorted.each { |n| + n.outgoing.each { |e| + e.head.score = \ + semiring.add.call(e.head.score, \ + semiring.multiply.call(n.score, e.weight) + ) + } + } +end + +# Dijkstra algorithm +# for A*-search we would need an optimistic estimate of +# future cost at each node +def DAG::dijkstra graph, semiring=RealSemiring.new, source_node + DAG::init(graph, semiring, source_node) + q = PriorityQueue.new graph + while !q.empty? + n = q.pop + n.outgoing.each { |e| + e.head.score = \ + semiring.add.call(e.head.score, \ + semiring.multiply.call(n.score, e.weight)) + q.sort! + } + end +end + +# Bellman-Ford algorithm +def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node) + DAG::init(graph, semiring, source_node) + edges = [] + graph.each { |n| edges |= n.outgoing } + # relax edges + (graph.size-1).times{ |i| + edges.each { |e| + e.head.score = \ + semiring.add.call(e.head.score, \ + semiring.multiply.call(e.tail.score, e.weight)) + } + } + # we do not allow cycles (negative or positive) +end + +# Floyd algorithm +def DAG::floyd(graph, semiring=nil) + dist_matrix = [] + graph.each_index { |i| + dist_matrix << [] + graph.each_index { |j| + val = 1.0/0.0 + val = 0.0 if i==j + dist_matrix.last << val + } + } + edges = [] + graph.each { |n| edges |= n.outgoing } + edges.each { |e| + dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight + } + 0.upto(graph.size-1) { |k| + 0.upto(graph.size-1) { |i| + 0.upto(graph.size-1) { |j| + if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j] + dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j] + end + } + } + } + return dist_matrix +end + + +# returns a list of nodes (graph) and a hash for finding +# nodes by their label (these need to be unique!) +def DAG::read_graph_from_json fn, semiring=RealSemiring.new + graph = [] + nodes_by_label = {} + h = JSON.parse File.new(fn).read + h['nodes'].each { |i| + n = DAG::Node.new i['label'] + graph << n + nodes_by_label[n.label] = n + } + h['edges'].each { |i| + n = nodes_by_label[i['tail']] + a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f)) + nodes_by_label[i['head']].incoming << a + } + return graph, nodes_by_label +end + + +end #module + diff --git a/lib/zipf/fileutil.rb b/lib/zipf/fileutil.rb new file mode 100644 index 0000000..eb69136 --- /dev/null +++ b/lib/zipf/fileutil.rb @@ -0,0 +1,88 @@ +require 'zlib' + + +class ReadFile + + def initialize fn, encoding='utf-8' + if fn.split('.').last == 'gz' + @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding) + elsif fn == '-' + @f = STDIN + STDIN.set_encoding encoding + else + @f = File.new fn, 'r' + @f.set_encoding encoding + end + end + + def gets + @f.gets { |line| yield line } + end + + def readlines + @f.readlines + end + + def self.readlines fn, encoding='utf-8' + f = ReadFile.new fn, encoding + r = f.readlines + f.close + return r + end + + def readlines_strip + self.readlines.map{ |i| i.strip } + end + + def self.readlines_strip fn, encoding='utf-8' + f = ReadFile.new fn, encoding + r = f.readlines_strip + f.close + return r + end + + def read + @f.read + end + + def self.read fn, encoding='utf-8' + f = ReadFile.new fn, encoding + r = f.read + f.close + return r + end + + def close + @f.close if @f!=STDIN + end +end + +class WriteFile + + def initialize fn, encoding='utf-8' + if fn.split('.').last == 'gz' + @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding) + elsif fn == '-' + @f = STDOUT + STDOUT.set_encoding encoding + else + @f = File.new fn, 'w+' + @f.set_encoding encoding + end + end + + def write s + @f.write s + end + + def self.write s, fn, encoding='utf-8' + f = WriteFile.new fn, encoding + f.write s + f.close + end + + def close + @f.close if @f!=STDIN + end +end + diff --git a/lib/zipf/grammar.rb b/lib/zipf/grammar.rb new file mode 100644 index 0000000..568b9fc --- /dev/null +++ b/lib/zipf/grammar.rb @@ -0,0 +1,123 @@ +module Grammar + + +class T + attr_accessor :word + + def initialize word + @word = word + end + + def to_s + "T<#{@word}>" + end +end + +class NT + attr_accessor :symbol, :index, :span + + def initialize symbol, index=0 + @symbol = symbol + @index = index + @span = Span.new + end + + def to_s + "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>" + end +end + +class Rule + attr_accessor :lhs, :rhs, :e + + def initialize lhs=nil, rhs=[], e='' + @lhs = lhs + @rhs = rhs + @e = e + end + + def to_s + "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}" + end + + def arity + rhs.select { |i| i.class == NT }.size + end + + def from_s s + _ = splitpipe s, 3 + @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "") + _[1].split.each { |x| + x.strip! + if x[0]=='[' && x[x.size-1] == ']' + @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0]) + else + @rhs << T.new(x) + end + } + @e = _[2] + end + + def self.from_s s + r = self.new + r.from_s s + return r + end +end + +class Span + attr_accessor :left, :right + + def initialize left=nil, right=nil + @left = left + @right = right + end +end + +class Grammar + attr_accessor :rules, :startn, :startt, :flat + + def initialize fn + @rules = []; @startn = []; @startt = [] ;@flat = [] + ReadFile.readlines_strip(fn).each_with_index { |s,i| + STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0 + @rules << Rule.from_s(s) + if @rules.last.rhs.first.class == NT + @startn << @rules.last + else + if rules.last.arity == 0 + @flat << @rules.last + else + @startt << @rules.last + end + end + } + STDERR.write "\n" + end + + def to_s + s = '' + @rules.each { |r| s += r.to_s+"\n" } + return s + end + + def add_glue_rules + @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol| + @rules << Rule.new(NT.new('S'), [NT.new(symbol)]) + @startn << @rules.last + @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')]) + @startn << @rules.last + } + end + + def add_pass_through_rules s + s.each { |word| + @rules << Rule.new(NT.new('X'), [T.new(word)]) + @flat << @rules.last + } + end +end + + +end #module + diff --git a/lib/zipf/hg.rb b/lib/zipf/hg.rb new file mode 100644 index 0000000..f86bf60 --- /dev/null +++ b/lib/zipf/hg.rb @@ -0,0 +1,173 @@ +#!/usr/bin/env ruby + +require_relative 'semirings' +require 'json' + + +module HG + + +class HG::Node + attr_accessor :label, :cat, :outgoing, :incoming, :score + + def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil + @label = label + @cat = cat + @outgoing = outgoing + @incoming = incoming + @score = nil + end + + def to_s + "Node" + end +end + +class HG::Hypergraph + attr_accessor :nodes, :edges + + def initialize nodes=[], edges=[] + @nodes = nodes + @edges = edges + end + + def arity + @edges.map { |e| e.arity }.max + end + + def to_s + "Hypergraph" + end +end + +class HG::Hyperedge + attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right + + def initialize head=nil, tails=[], weight=0.0, f={} + @head = head + @tails = tails + @weight = weight + @f = f + @mark = 0 + end + + def arity + return @tails.size + end + + def marked? + arity == @mark + end + + def to_s + "Hyperedge" + end +end + +def HG::topological_sort nodes + sorted = [] + s = nodes.reject { |n| !n.incoming.empty? } + while !s.empty? + sorted << s.shift + sorted.last.outgoing.each { |e| + next if e.marked? + e.mark += 1 + s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty? + } + end + return sorted +end + +def HG::init nodes, semiring, root + nodes.each { |n| n.score=semiring.null } + root.score = semiring.one +end + +def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + init toposorted, semiring, root + toposorted.each { |n| + n.incoming.each { |e| + s = semiring.one + e.tails.each { |m| + s = semiring.multiply.call(s, m.score) + } + n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight)) + } + } +end + +def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + init toposorted, semiring, root + best_path = [] + toposorted.each { |n| + best_edge = nil + n.incoming.each { |e| + s = semiring.one + e.tails.each { |m| + s = semiring.multiply.call(s, m.score) + } + if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add + best_edge = e + end + n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight)) + } + best_path << best_edge + } + return best_path, toposorted.last.score +end + +def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false + nodes = [] + edges = [] + nodes_by_label = {} + nodes_by_index = [] + h = JSON.parse File.new(fn).read + w = SparseVector.from_h h['weights'] + h['nodes'].each { |i| + n = Node.new i['label'], i['cat'] + nodes << n + nodes_by_label[n.label] = n + nodes_by_index << n + } + h['edges'].each { |i| + e = Hyperedge.new(nodes_by_label[i['head']], \ + i['tails'].map{|j| nodes_by_label[j]}.to_a, \ + semiring.convert.call(i['weight'].to_f), \ + {}) + e.f = SparseVector.from_h i['f'] + if log_weights + e.weight = Math.exp(w.dot(e.f)) + else + e.weight = w.dot(e.f) + end + e.tails.each { |m| + m.outgoing << e + } + e.head.incoming << e + edges << e + } + return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index +end + +def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + paths = [[]] + toposorted.each { |n| + next if n.incoming.empty? + new_paths = [] + while !paths.empty? + p = paths.pop + n.incoming.each { |e| + new_paths << p+[e] + } + end + paths = new_paths + } + return paths +end + + +end #module + diff --git a/lib/zipf/misc.rb b/lib/zipf/misc.rb new file mode 100644 index 0000000..0319a5f --- /dev/null +++ b/lib/zipf/misc.rb @@ -0,0 +1,114 @@ +require 'timeout' + + +class Array + def max_index + self.index(self.max) + end + + def is_subset_of? other + self.each { |i| + if other.include? i + return false + end + } + return true + end + + def sum + self.inject(:+) + end + + def mean + self.sum.to_f/self.size + end +end + +class String + + def downcase? + self[/[[:lower:]]/] + end +end + +class PriorityQueue +# This assumes that elements in the queue +# have a numerical member named 'score'. + + def initialize a=Array.new + @queue = Array.new a + sort! + end + + def sort! + @queue.sort_by! { |i| -i.score } + end + + def pop + @queue.pop + end + + def push i + @queue << i + sort! + end + + def empty? + @queue.empty? + end +end + +def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false + STDERR.write cmd+"\n" if debug + pipe_in, pipe_out = IO.pipe + pid = Process.spawn(cmd, :out => pipe_out) + begin + Timeout.timeout(t) { Process.wait pid } + rescue Timeout::Error + Process.kill('TERM', pid) if !ignore_fail + end + pipe_out.close + return pipe_in.read +end + +def read_phrase_table fn + table = {} + f = ReadFile.new fn + while raw_rule = f.gets + french, english, features = splitpipe(raw_rule) + feature_map = SparseVector.from_kv features + if table.has_key? french + table[french] << [english, feature_map ] + else + table[french] = [[english, feature_map]] + end + end + f.close + return table +end + +def cdec_kbest cdec_bin, input, ini, weights, k, unique=true + require 'open3' + cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}" + cmd += " -r" if unique + o,_ = Open3.capture2 "#{cmd} 2>/dev/null" + a = []; j = -1 + o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t } + return a +end + +def read_config fn + f = ReadFile.new fn + cfg = {} + while line = f.gets + line.strip! + next if /^\s*$/.match line + next if line[0]=='#' + content = line.split('#', 2).first + k, v = content.split(/\s*=\s*/, 2) + k.strip!; v.strip! + cfg[k] = v + end + return cfg +end + diff --git a/lib/zipf/semirings.rb b/lib/zipf/semirings.rb new file mode 100644 index 0000000..920923f --- /dev/null +++ b/lib/zipf/semirings.rb @@ -0,0 +1,81 @@ +# Semirings for directed acyclic graphs (dags) (also directed hypergraphs), +# as described in: +# 'Dynamic Programming Algorithms in +# Semiring and Hypergraph Frameworks' (Liang Huang) +# + +class Semiring + attr_accessor :add, :multiply, :one, :null, :convert +end + +class BooleanSemiring < Semiring + def initialize + @add = Proc.new { |a,b| a||b } + @multiply = Proc.new { |a,b| a&&b } + @one = true + @null = false + @convert = Proc.new { |v| true && v!=0 } + end +end + +class ViterbiSemiring < Semiring + def initialize + @add = Proc.new { |a,b| [a,b].max } + @multiply = Proc.new { |a,b| a*b } + @one = 1.0 + @null = 0.0 + @convert = Proc.new { |v| v } + end +end + +class ViterbiLogSemiring < Semiring + def initialize + @add = Proc.new { |a,b| [a,b].max } + @multiply = Proc.new { |a,b| a+b } + @one = 0.0 + @null = -1.0/0.0 + @convert = Proc.new { |v| v } + end +end + +class InsideSemiring < Semiring + def initialize + @add = Proc.new { |a,b| a+b } + @multiply = Proc.new { |a,b| a*b } + @one = 1.0 + @null = 0.0 + @convert = Proc.new { |v| v } + end +end + +class RealSemiring < Semiring + def initialize + @add = Proc.new { |a,b| [a,b].min } + @multiply = Proc.new { |a,b| a+b } + @one = 0.0 + @null = 1.0/0.0 + @convert = Proc.new { |v| v } + end +end + +# for longest/worst paths +class RealxSemiring < Semiring + def initialize + @add = Proc.new { |a,b| [a,b].max } + @multiply = Proc.new { |a,b| a+b } + @one = -1.0/0.0 + @null = 0.0 + @convert = Proc.new { |v| v } + end +end + +class CountingSemiring < Semiring + def initialize + @add = Proc.new { |a,b| a+b } + @multiply = Proc.new { |a,b| a*b } + @one = 1.0 + @null = 0.0 + @convert = Proc.new { |v| if v!=0 then 1 else 0 end } + end +end + diff --git a/lib/zipf/stringutil.rb b/lib/zipf/stringutil.rb new file mode 100644 index 0000000..aa9be00 --- /dev/null +++ b/lib/zipf/stringutil.rb @@ -0,0 +1,22 @@ +def tokenize s + s.strip.split +end + +def ngrams(s, n, fix=false) + a = tokenize s + a.each_with_index { |tok, i| + tok.strip! + 0.upto([n-1, a.size-i-1].min) { |m| + yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n) + } + } +end + +def bag_of_words s, stopwords=[] + s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } +end + +def splitpipe s, n=3 + s.strip.split("|"*n) +end + diff --git a/lib/zipf/tfidf.rb b/lib/zipf/tfidf.rb new file mode 100644 index 0000000..13a40a3 --- /dev/null +++ b/lib/zipf/tfidf.rb @@ -0,0 +1,38 @@ +module TFIDF + + +# returns key='raw frequency' for an +# array-like object +def TFIDF::tf array, stopwords=[] + v = {}; v.default = 0 + array.uniq.each { |i| + next if stopwords.include? i + v[i] = array.count(i).to_f + } + return v +end + +# smoothes raw frequencies of tf() in-place +# a is a smoothing term +def TFIDF::ntf hash, a=0.4 + max = hash.values.max.to_f + hash.each_pair { |k,v| + hash[k] = a + (1-a)*(v/max) + } +end + +# returns idf value for each word in a vocabulary +def TFIDF::idf list_of_hashes + vocab = list_of_hashes.values.flatten.uniq + n = list_of_hashes.size.to_f + idf = {} + vocab.each { |i| + df = list_of_hashes.values.flatten.count i + idf[i] = Math.log(n/df) + } + return idf +end + + +end #module + diff --git a/nlp_ruby.gemspec b/nlp_ruby.gemspec deleted file mode 100644 index 1393918..0000000 --- a/nlp_ruby.gemspec +++ /dev/null @@ -1,13 +0,0 @@ -Gem::Specification.new do |s| - s.name = 'nlp_ruby' - s.version = '0.4.4' - s.date = '2014-06-03' - s.summary = 'nlp_ruby' - s.description = 'NLP related tools and classes' - s.authors = ['Patrick Simianer'] - s.email = 'p@simianer.de' - s.files = Dir['lib/*.rb', 'lib/nlp_ruby/*.rb'] - s.homepage = 'http://simianer.de' - s.license = 'MIT' -end - diff --git a/test/test_dag.rb b/test/test_dag.rb index f5330da..5c3938b 100755 --- a/test/test_dag.rb +++ b/test/test_dag.rb @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'nlp_ruby' +require_relative '../lib/zipf' require 'test/unit' diff --git a/test/test_hg.rb b/test/test_hg.rb index 50fedc0..667a3b8 100755 --- a/test/test_hg.rb +++ b/test/test_hg.rb @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -require 'nlp_ruby' +require_relative '../lib/zipf' require 'test/unit' diff --git a/zipf.gemspec b/zipf.gemspec new file mode 100644 index 0000000..5873037 --- /dev/null +++ b/zipf.gemspec @@ -0,0 +1,13 @@ +Gem::Specification.new do |s| + s.name = 'zipf' + s.version = '1.0' + s.date = '2014-06-16' + s.summary = 'nlp_ruby' + s.description = 'NLP related tools and classes' + s.authors = ['Patrick Simianer'] + s.email = 'p@simianer.de' + s.files = Dir['lib/*.rb', 'lib/zipf/*.rb'] + s.homepage = 'http://simianer.de' + s.license = 'MIT' +end + -- cgit v1.2.3