From 4059a5d048cb0f72872c98073ef1ce120a30d78c Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Mon, 16 Jun 2014 17:44:07 +0200
Subject: renaming to zipf
---
lib/nlp_ruby.rb | 18 ----
lib/nlp_ruby/SparseVector.rb | 172 ------------------------------------
lib/nlp_ruby/Translation.rb | 72 ---------------
lib/nlp_ruby/bleu.rb | 130 ---------------------------
lib/nlp_ruby/dag.rb | 205 -------------------------------------------
lib/nlp_ruby/fileutil.rb | 88 -------------------
lib/nlp_ruby/grammar.rb | 122 -------------------------
lib/nlp_ruby/hg.rb | 173 ------------------------------------
lib/nlp_ruby/misc.rb | 114 ------------------------
lib/nlp_ruby/semirings.rb | 79 -----------------
lib/nlp_ruby/stringutil.rb | 22 -----
lib/nlp_ruby/tfidf.rb | 38 --------
lib/zipf.rb | 18 ++++
lib/zipf/SparseVector.rb | 172 ++++++++++++++++++++++++++++++++++++
lib/zipf/Translation.rb | 72 +++++++++++++++
lib/zipf/bleu.rb | 130 +++++++++++++++++++++++++++
lib/zipf/dag.rb | 205 +++++++++++++++++++++++++++++++++++++++++++
lib/zipf/fileutil.rb | 88 +++++++++++++++++++
lib/zipf/grammar.rb | 123 ++++++++++++++++++++++++++
lib/zipf/hg.rb | 173 ++++++++++++++++++++++++++++++++++++
lib/zipf/misc.rb | 114 ++++++++++++++++++++++++
lib/zipf/semirings.rb | 81 +++++++++++++++++
lib/zipf/stringutil.rb | 22 +++++
lib/zipf/tfidf.rb | 38 ++++++++
24 files changed, 1236 insertions(+), 1233 deletions(-)
delete mode 100755 lib/nlp_ruby.rb
delete mode 100644 lib/nlp_ruby/SparseVector.rb
delete mode 100644 lib/nlp_ruby/Translation.rb
delete mode 100644 lib/nlp_ruby/bleu.rb
delete mode 100644 lib/nlp_ruby/dag.rb
delete mode 100644 lib/nlp_ruby/fileutil.rb
delete mode 100644 lib/nlp_ruby/grammar.rb
delete mode 100644 lib/nlp_ruby/hg.rb
delete mode 100644 lib/nlp_ruby/misc.rb
delete mode 100644 lib/nlp_ruby/semirings.rb
delete mode 100644 lib/nlp_ruby/stringutil.rb
delete mode 100644 lib/nlp_ruby/tfidf.rb
create mode 100755 lib/zipf.rb
create mode 100644 lib/zipf/SparseVector.rb
create mode 100644 lib/zipf/Translation.rb
create mode 100644 lib/zipf/bleu.rb
create mode 100644 lib/zipf/dag.rb
create mode 100644 lib/zipf/fileutil.rb
create mode 100644 lib/zipf/grammar.rb
create mode 100644 lib/zipf/hg.rb
create mode 100644 lib/zipf/misc.rb
create mode 100644 lib/zipf/semirings.rb
create mode 100644 lib/zipf/stringutil.rb
create mode 100644 lib/zipf/tfidf.rb
(limited to 'lib')
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
deleted file mode 100755
index 0e26e97..0000000
--- a/lib/nlp_ruby.rb
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'nlp_ruby/stringutil'
-require 'nlp_ruby/fileutil'
-require 'nlp_ruby/SparseVector'
-require 'nlp_ruby/tfidf'
-require 'nlp_ruby/Translation'
-require 'nlp_ruby/dag'
-require 'nlp_ruby/semirings'
-require 'nlp_ruby/bleu'
-require 'nlp_ruby/misc'
-require 'nlp_ruby/hg'
-require 'nlp_ruby/grammar'
-
-STDIN.set_encoding 'utf-8'
-STDOUT.set_encoding 'utf-8'
-STDERR.set_encoding 'utf-8'
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
deleted file mode 100644
index 3096412..0000000
--- a/lib/nlp_ruby/SparseVector.rb
+++ /dev/null
@@ -1,172 +0,0 @@
-class SparseVector < Hash
-
- def initialize arg=nil
- super
- self.default = 0
- if arg.is_a? Array
- from_a arg
- end
- end
-
- def from_a a
- a.each_with_index { |i,j| self[j] = i }
- end
-
- def self.from_a a
- v = SparseVector.new
- v.from_a a
- return v
- end
-
- def from_h h
- h.each_pair { |k,v| self[k] = v }
- end
-
- def self.from_h h
- v = SparseVector.new
- v.from_h h
- return v
- end
-
- def from_s s
- from_h eval(s)
- end
-
- def self.from_s s
- v = SparseVector.new
- v.from_s s
- return v
- end
-
- def to_kv sep='=', join=' '
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join join
- end
-
- def from_kv s
- s.split.each { |i|
- k,v = i.split('=')
- self[k] = v.to_f
- }
- end
-
- def self.from_kv s
- v = SparseVector.new
- v.from_kv s
- return v
- end
-
- def from_file fn, sep='='
- f = ReadFile.new(fn)
- while line = f.gets
- key, value = line.strip.split sep
- value = value.to_f
- self[key] = value
- end
- end
-
- def self.from_file fn, sep='='
- v = SparseVector.new
- v.from_file fn, sep
- return v
- end
-
- def join_keys other
- self.keys + other.keys
- end
-
- def sum
- self.values.inject(:+)
- end
-
- def approx_eql? other, p=10**-10
- return false if !other
- return false if other.size!=self.size
- return false if other.keys.sort!=self.keys.sort
- self.keys.each { |k|
- return false if (self[k]-other[k]).abs>p
- }
- return true
- end
-
- def average
- self.sum/self.size.to_f
- end
-
- def variance
- avg = self.average
- var = 0.0
- self.values.each { |i| var += (avg - i)**2 }
- return var
- end
-
- def stddev
- Math.sqrt self.variance
- end
-
- def dot other
- sum = 0.0
- self.each_pair { |k,v| sum += v * other[k] }
- return sum
- end
-
- def zeros n
- (0).upto(n-1) { |i| self[i] = 0.0 }
- end
-
- def magnitude
- Math.sqrt self.values.inject { |sum,i| sum+i**2 }
- end
-
- def cosinus_sim other
- self.dot(other)/(self.magnitude*other.magnitude)
- end
-
- def euclidian_dist other
- dims = [self.keys, other.keys].flatten.uniq
- sum = 0.0
- dims.each { |d| sum += (self[d] - other[d])**2 }
- return Math.sqrt(sum)
- end
-
- def + other
- new = SparseVector.new
- join_keys(other).each { |k|
- new[k] = self[k]+other[k]
- }
- return new
- end
-
- def - other
- new = SparseVector.new
- join_keys(other).each { |k|
- new[k] = self[k]-other[k]
- }
- return new
- end
-
- def * scalar
- raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
- new = SparseVector.new
- self.keys.each { |k|
- new[k] = self[k] * scalar
- }
- return new
- end
-
- def self.mean a
- mean = SparseVector.new
- a.each { |i|
- i.each_pair { |k,v|
- mean[k] += v
- }
- }
- n = a.size.to_f
- mean.each_pair { |k,v| mean[k] = v/n }
- return mean
- end
-end
-
diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb
deleted file mode 100644
index 3759a1d..0000000
--- a/lib/nlp_ruby/Translation.rb
+++ /dev/null
@@ -1,72 +0,0 @@
-class Translation
- attr_accessor :id, :s, :raw, :f, :scores, :rank
-
- def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
- @id = id
- @raw = raw
- @s = s
- @f = f
- @scores = scores
- @rank = rank
- end
-
- def from_s t, strip_alignment=true, rank=nil
- id, raw, features, score = splitpipe(t, 3)
- raw.strip!
- @raw = raw
- if strip_alignment # the way moses does it
- @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
- @s.strip!
- else
- @s = raw
- end
- @id = id.to_i
- @f = SparseVector.from_kv features
- @scores[:decoder] = score.to_f
- @rank = rank
- end
-
- def self.from_s s
- t = self.new
- t.from_s s
- return t
- end
-
- def to_s include_features=true
- [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features
- [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features
- end
-
- def to_s2
- [@rank, @s, @score, @scores.to_s].join ' ||| '
- end
-end
-
-def read_kbest_lists fn, translation_type=Translation
- kbest_lists = []
- cur = []
- f = ReadFile.new fn
- prev = -1
- c = 0
- id = 0
- while line = f.gets
- t = translation_type.new
- t.from_s line
- c = splitpipe(line)[0].to_i
- if c != prev
- if cur.size > 0
- kbest_lists << cur
- cur = []
- end
- prev = c
- id = 0
- end
- t.id = id
- cur << t
- id += 1
- end
- kbest_lists << cur # last one
- f.close
- return kbest_lists
-end
-
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
deleted file mode 100644
index 56f341b..0000000
--- a/lib/nlp_ruby/bleu.rb
+++ /dev/null
@@ -1,130 +0,0 @@
-module BLEU
-
-
-class BLEU::NgramCounts
- attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
-
- def initialize(n)
- @n = 0
- @sum = []
- @clipped = []
- @ref_len = 0.0
- @hyp_len = 0.0
- grow(n)
- end
-
- def grow(n)
- (n-@n).times {
- @sum << 0.0
- @clipped << 0.0
- }
- @n = n
- end
-
- def plus_eq(other)
- if other.n > @n then grow(other.n) end
- 0.upto(other.n-1) { |m|
- @sum[m] += other.sum[m]
- @clipped[m] += other.clipped[m]
- }
- @ref_len += other.ref_len
- @hyp_len += other.hyp_len
- end
-
- def to_s
- return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
- end
-end
-
-class BLEU::Ngrams
- def initialize
- @h_ = {}
- @h_.default = 0
- end
-
- def add(k)
- if k.class == Array then k = k.join ' ' end
- @h_[k] += 1
- end
-
- def get_count(k)
- if k.class == Array then k = k.join ' ' end
- return @h_[k]
- end
-
- def each
- @h_.each_pair { |k,v|
- yield k.split, v
- }
- end
-
- def to_s
- @h_.to_s
- end
-end
-
-def BLEU::get_counts hypothesis, reference, n, times=1
- p = NgramCounts.new n
- r = Ngrams.new
- ngrams(reference, n) { |ng| r.add ng }
- h = Ngrams.new
- ngrams(hypothesis, n) { |ng| h.add ng }
- h.each { |ng,count|
- sz = ng.size-1
- p.sum[sz] += count * times
- p.clipped[sz] += [r.get_count(ng), count].min * times
- }
- p.ref_len = tokenize(reference.strip).size * times
- p.hyp_len = tokenize(hypothesis.strip).size * times
- return p
-end
-
-def BLEU::brevity_penalty c, r, smooth=0.0
- return [0.0, 1.0-((r+smooth)/c)].min
-end
-
-def BLEU::bleu counts, n, debug=false
- corpus_stats = NgramCounts.new n
- counts.each { |i| corpus_stats.plus_eq i }
- logbleu = 0.0
- 0.upto(n-1) { |m|
- STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
- return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
- logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
- }
- logbleu /= n
- if debug
- STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
- STDERR.write "sum #{Math.exp(sum)}\n"
- end
- logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
- return Math.exp logbleu
-end
-
-def BLEU::hbleu counts, n, debug=false
- (100*bleu(counts, n, debug)).round(3)
-end
-
-def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
- h_ng = {}; r_ng = {}
- (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
- ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
- ngrams(reference, n) { |i| r_ng[i.size] << i }
- m = [n, reference.split.size].min
- add = 0.0
- logbleu = 0.0
- (1).upto(m) { |i|
- counts_clipped = 0
- counts_sum = h_ng[i].size
- h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
- add = 1.0 if i >= 2
- logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
- }
- logbleu /= m
- logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
- return Math.exp logbleu
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb
deleted file mode 100644
index 94afa23..0000000
--- a/lib/nlp_ruby/dag.rb
+++ /dev/null
@@ -1,205 +0,0 @@
-module DAG
-
-require 'json'
-
-
-class DAG::Node
- attr_accessor :label, :outgoing, :incoming, :score, :mark
-
- def initialize label=nil, outgoing=[], incoming=[], score=nil
- @label = label
- @outgoing = outgoing
- @incoming = incoming
- @score = nil
- end
-
- def add_edge head, weight=0
- exit if self==head # no self-cycles!
- @outgoing << DAG::Edge.new(self, head, weight)
- return @outgoing.last
- end
-
- def to_s
- "DAG::Node"
- end
-
- def repr
- "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
- end
-end
-
-class DAG::Edge
- attr_accessor :tail, :head, :weight, :mark
-
- def initialize tail=nil, head=nil, weight=0
- @tail = tail
- @head = head
- @weight = weight
- @mark = false # did we already follow this edge? -- for topological sorting
- end
-
- def to_s
- s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
- s += " x" if @mark
- s += ">"
- s
- end
-end
-
-# depth-first search
-# w/o markings as we do not have cycles
-def DAG::dfs n, target_label
- return n if n.label==target_label # assumes uniq labels!
- stack = n.outgoing.map { |i| i.head }
- while !stack.empty?
- m = stack.pop
- return DAG::dfs m, target_label
- end
- return nil
-end
-
-# breadth-first search
-# w/o markings as we do not have cycles
-def DAG::bfs n, target_label
- queue = [n]
- while !queue.empty?
- m = queue.shift
- return m if m.label==target_label
- m.outgoing.each { |e| queue << e.head }
- end
- return nil
-end
-
-# topological sort
-def DAG::topological_sort graph
- sorted = []
- s = graph.reject { |n| !n.incoming.empty? }
- while !s.empty?
- sorted << s.shift
- sorted.last.outgoing.each { |e|
- e.mark = true
- s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
- }
- end
- return sorted
-end
-
-# initialize graph scores with semiring One
-def DAG::init graph, semiring, source_node
- graph.each {|n| n.score=semiring.null}
- source_node.score = semiring.one
-end
-
-# viterbi
-def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
- toposorted = DAG::topological_sort(graph)
- DAG::init(graph, semiring, source_node)
- toposorted.each { |n|
- n.incoming.each { |e|
- # update
- n.score = \
- semiring.add.call(n.score, \
- semiring.multiply.call(e.tail.score, e.weight)
- )
- }
- }
-end
-
-# forward viterbi
-def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
- toposorted = DAG::topological_sort(graph)
- DAG::init(graph, semiring, source_node)
- toposorted.each { |n|
- n.outgoing.each { |e|
- e.head.score = \
- semiring.add.call(e.head.score, \
- semiring.multiply.call(n.score, e.weight)
- )
- }
- }
-end
-
-# Dijkstra algorithm
-# for A*-search we would need an optimistic estimate of
-# future cost at each node
-def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
- DAG::init(graph, semiring, source_node)
- q = PriorityQueue.new graph
- while !q.empty?
- n = q.pop
- n.outgoing.each { |e|
- e.head.score = \
- semiring.add.call(e.head.score, \
- semiring.multiply.call(n.score, e.weight))
- q.sort!
- }
- end
-end
-
-# Bellman-Ford algorithm
-def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
- DAG::init(graph, semiring, source_node)
- edges = []
- graph.each { |n| edges |= n.outgoing }
- # relax edges
- (graph.size-1).times{ |i|
- edges.each { |e|
- e.head.score = \
- semiring.add.call(e.head.score, \
- semiring.multiply.call(e.tail.score, e.weight))
- }
- }
- # we do not allow cycles (negative or positive)
-end
-
-# Floyd algorithm
-def DAG::floyd(graph, semiring=nil)
- dist_matrix = []
- graph.each_index { |i|
- dist_matrix << []
- graph.each_index { |j|
- val = 1.0/0.0
- val = 0.0 if i==j
- dist_matrix.last << val
- }
- }
- edges = []
- graph.each { |n| edges |= n.outgoing }
- edges.each { |e|
- dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
- }
- 0.upto(graph.size-1) { |k|
- 0.upto(graph.size-1) { |i|
- 0.upto(graph.size-1) { |j|
- if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
- dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
- end
- }
- }
- }
- return dist_matrix
-end
-
-
-# returns a list of nodes (graph) and a hash for finding
-# nodes by their label (these need to be unique!)
-def DAG::read_graph_from_json fn, semiring=RealSemiring.new
- graph = []
- nodes_by_label = {}
- h = JSON.parse File.new(fn).read
- h['nodes'].each { |i|
- n = DAG::Node.new i['label']
- graph << n
- nodes_by_label[n.label] = n
- }
- h['edges'].each { |i|
- n = nodes_by_label[i['tail']]
- a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
- nodes_by_label[i['head']].incoming << a
- }
- return graph, nodes_by_label
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb
deleted file mode 100644
index eb69136..0000000
--- a/lib/nlp_ruby/fileutil.rb
+++ /dev/null
@@ -1,88 +0,0 @@
-require 'zlib'
-
-
-class ReadFile
-
- def initialize fn, encoding='utf-8'
- if fn.split('.').last == 'gz'
- @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
- elsif fn == '-'
- @f = STDIN
- STDIN.set_encoding encoding
- else
- @f = File.new fn, 'r'
- @f.set_encoding encoding
- end
- end
-
- def gets
- @f.gets { |line| yield line }
- end
-
- def readlines
- @f.readlines
- end
-
- def self.readlines fn, encoding='utf-8'
- f = ReadFile.new fn, encoding
- r = f.readlines
- f.close
- return r
- end
-
- def readlines_strip
- self.readlines.map{ |i| i.strip }
- end
-
- def self.readlines_strip fn, encoding='utf-8'
- f = ReadFile.new fn, encoding
- r = f.readlines_strip
- f.close
- return r
- end
-
- def read
- @f.read
- end
-
- def self.read fn, encoding='utf-8'
- f = ReadFile.new fn, encoding
- r = f.read
- f.close
- return r
- end
-
- def close
- @f.close if @f!=STDIN
- end
-end
-
-class WriteFile
-
- def initialize fn, encoding='utf-8'
- if fn.split('.').last == 'gz'
- @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
- elsif fn == '-'
- @f = STDOUT
- STDOUT.set_encoding encoding
- else
- @f = File.new fn, 'w+'
- @f.set_encoding encoding
- end
- end
-
- def write s
- @f.write s
- end
-
- def self.write s, fn, encoding='utf-8'
- f = WriteFile.new fn, encoding
- f.write s
- f.close
- end
-
- def close
- @f.close if @f!=STDIN
- end
-end
-
diff --git a/lib/nlp_ruby/grammar.rb b/lib/nlp_ruby/grammar.rb
deleted file mode 100644
index 7bd8fe6..0000000
--- a/lib/nlp_ruby/grammar.rb
+++ /dev/null
@@ -1,122 +0,0 @@
-module Grammar
-
-class T
- attr_accessor :word
-
- def initialize word
- @word = word
- end
-
- def to_s
- "T<#{@word}>"
- end
-end
-
-class NT
- attr_accessor :symbol, :index, :span
-
- def initialize symbol, index=0
- @symbol = symbol
- @index = index
- @span = Span.new
- end
-
- def to_s
- "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
- end
-end
-
-class Rule
- attr_accessor :lhs, :rhs, :e
-
- def initialize lhs=nil, rhs=[], e=''
- @lhs = lhs
- @rhs = rhs
- @e = e
- end
-
- def to_s
- "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
- end
-
- def arity
- rhs.select { |i| i.class == NT }.size
- end
-
- def from_s s
- _ = splitpipe s, 3
- @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
- _[1].split.each { |x|
- x.strip!
- if x[0]=='[' && x[x.size-1] == ']'
- @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
- else
- @rhs << T.new(x)
- end
- }
- @e = _[2]
- end
-
- def self.from_s s
- r = self.new
- r.from_s s
- return r
- end
-end
-
-class Span
- attr_accessor :left, :right
-
- def initialize left=nil, right=nil
- @left = left
- @right = right
- end
-end
-
-class Grammar
- attr_accessor :rules, :startn, :startt, :flat
-
- def initialize fn
- @rules = []; @startn = []; @startt = [] ;@flat = []
- ReadFile.readlines_strip(fn).each_with_index { |s,i|
- STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
- @rules << Rule.from_s(s)
- if @rules.last.rhs.first.class == NT
- @startn << @rules.last
- else
- if rules.last.arity == 0
- @flat << @rules.last
- else
- @startt << @rules.last
- end
- end
- }
- STDERR.write "\n"
- end
-
- def to_s
- s = ''
- @rules.each { |r| s += r.to_s+"\n" }
- return s
- end
-
- def add_glue_rules
- @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
- @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
- @startn << @rules.last
- @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
- @startn << @rules.last
- }
- end
-
- def add_pass_through_rules s
- s.each { |word|
- @rules << Rule.new(NT.new('X'), [T.new(word)])
- @flat << @rules.last
- }
- end
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/hg.rb b/lib/nlp_ruby/hg.rb
deleted file mode 100644
index b8b147e..0000000
--- a/lib/nlp_ruby/hg.rb
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'nlp_ruby'
-require 'json'
-
-
-module HG
-
-
-class HG::Node
- attr_accessor :label, :cat, :outgoing, :incoming, :score
-
- def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
- @label = label
- @cat = cat
- @outgoing = outgoing
- @incoming = incoming
- @score = nil
- end
-
- def to_s
- "Node"
- end
-end
-
-class HG::Hypergraph
- attr_accessor :nodes, :edges
-
- def initialize nodes=[], edges=[]
- @nodes = nodes
- @edges = edges
- end
-
- def arity
- @edges.map { |e| e.arity }.max
- end
-
- def to_s
- "Hypergraph"
- end
-end
-
-class HG::Hyperedge
- attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right
-
- def initialize head=nil, tails=[], weight=0.0, f={}
- @head = head
- @tails = tails
- @weight = weight
- @f = f
- @mark = 0
- end
-
- def arity
- return @tails.size
- end
-
- def marked?
- arity == @mark
- end
-
- def to_s
- "Hyperedge"
- end
-end
-
-def HG::topological_sort nodes
- sorted = []
- s = nodes.reject { |n| !n.incoming.empty? }
- while !s.empty?
- sorted << s.shift
- sorted.last.outgoing.each { |e|
- next if e.marked?
- e.mark += 1
- s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
- }
- end
- return sorted
-end
-
-def HG::init nodes, semiring, root
- nodes.each { |n| n.score=semiring.null }
- root.score = semiring.one
-end
-
-def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
- toposorted = topological_sort hypergraph.nodes
- init toposorted, semiring, root
- toposorted.each { |n|
- n.incoming.each { |e|
- s = semiring.one
- e.tails.each { |m|
- s = semiring.multiply.call(s, m.score)
- }
- n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
- }
- }
-end
-
-def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
- toposorted = topological_sort hypergraph.nodes
- init toposorted, semiring, root
- best_path = []
- toposorted.each { |n|
- best_edge = nil
- n.incoming.each { |e|
- s = semiring.one
- e.tails.each { |m|
- s = semiring.multiply.call(s, m.score)
- }
- if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
- best_edge = e
- end
- n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
- }
- best_path << best_edge
- }
- return best_path, toposorted.last.score
-end
-
-def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
- nodes = []
- edges = []
- nodes_by_label = {}
- nodes_by_index = []
- h = JSON.parse File.new(fn).read
- w = SparseVector.from_h h['weights']
- h['nodes'].each { |i|
- n = Node.new i['label'], i['cat']
- nodes << n
- nodes_by_label[n.label] = n
- nodes_by_index << n
- }
- h['edges'].each { |i|
- e = Hyperedge.new(nodes_by_label[i['head']], \
- i['tails'].map{|j| nodes_by_label[j]}.to_a, \
- semiring.convert.call(i['weight'].to_f), \
- {})
- e.f = SparseVector.from_h i['f']
- if log_weights
- e.weight = Math.exp(w.dot(e.f))
- else
- e.weight = w.dot(e.f)
- end
- e.tails.each { |m|
- m.outgoing << e
- }
- e.head.incoming << e
- edges << e
- }
- return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
-end
-
-def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
- toposorted = topological_sort hypergraph.nodes
- paths = [[]]
- toposorted.each { |n|
- next if n.incoming.empty?
- new_paths = []
- while !paths.empty?
- p = paths.pop
- n.incoming.each { |e|
- new_paths << p+[e]
- }
- end
- paths = new_paths
- }
- return paths
-end
-
-
-end #module
-
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
deleted file mode 100644
index 0319a5f..0000000
--- a/lib/nlp_ruby/misc.rb
+++ /dev/null
@@ -1,114 +0,0 @@
-require 'timeout'
-
-
-class Array
- def max_index
- self.index(self.max)
- end
-
- def is_subset_of? other
- self.each { |i|
- if other.include? i
- return false
- end
- }
- return true
- end
-
- def sum
- self.inject(:+)
- end
-
- def mean
- self.sum.to_f/self.size
- end
-end
-
-class String
-
- def downcase?
- self[/[[:lower:]]/]
- end
-end
-
-class PriorityQueue
-# This assumes that elements in the queue
-# have a numerical member named 'score'.
-
- def initialize a=Array.new
- @queue = Array.new a
- sort!
- end
-
- def sort!
- @queue.sort_by! { |i| -i.score }
- end
-
- def pop
- @queue.pop
- end
-
- def push i
- @queue << i
- sort!
- end
-
- def empty?
- @queue.empty?
- end
-end
-
-def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
- STDERR.write cmd+"\n" if debug
- pipe_in, pipe_out = IO.pipe
- pid = Process.spawn(cmd, :out => pipe_out)
- begin
- Timeout.timeout(t) { Process.wait pid }
- rescue Timeout::Error
- Process.kill('TERM', pid) if !ignore_fail
- end
- pipe_out.close
- return pipe_in.read
-end
-
-def read_phrase_table fn
- table = {}
- f = ReadFile.new fn
- while raw_rule = f.gets
- french, english, features = splitpipe(raw_rule)
- feature_map = SparseVector.from_kv features
- if table.has_key? french
- table[french] << [english, feature_map ]
- else
- table[french] = [[english, feature_map]]
- end
- end
- f.close
- return table
-end
-
-def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
- require 'open3'
- cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
- cmd += " -r" if unique
- o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
- a = []; j = -1
- o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
- return a
-end
-
-def read_config fn
- f = ReadFile.new fn
- cfg = {}
- while line = f.gets
- line.strip!
- next if /^\s*$/.match line
- next if line[0]=='#'
- content = line.split('#', 2).first
- k, v = content.split(/\s*=\s*/, 2)
- k.strip!; v.strip!
- cfg[k] = v
- end
- return cfg
-end
-
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
deleted file mode 100644
index fda4683..0000000
--- a/lib/nlp_ruby/semirings.rb
+++ /dev/null
@@ -1,79 +0,0 @@
-# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
-# as described in:
-# 'Dynamic Programming Algorithms in
-# Semiring and Hypergraph Frameworks' (Liang Huang)
-class Semiring
- attr_accessor :add, :multiply, :one, :null, :convert
-end
-
-class BooleanSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| a||b }
- @multiply = Proc.new { |a,b| a&&b }
- @one = true
- @null = false
- @convert = Proc.new { |v| true && v!=0 }
- end
-end
-
-class ViterbiSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| [a,b].max }
- @multiply = Proc.new { |a,b| a*b }
- @one = 1.0
- @null = 0.0
- @convert = Proc.new { |v| v }
- end
-end
-
-class ViterbiLogSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| [a,b].max }
- @multiply = Proc.new { |a,b| a+b }
- @one = 0.0
- @null = -1.0/0.0
- @convert = Proc.new { |v| v }
- end
-end
-
-class InsideSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| a+b }
- @multiply = Proc.new { |a,b| a*b }
- @one = 1.0
- @null = 0.0
- @convert = Proc.new { |v| v }
- end
-end
-
-class RealSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| [a,b].min }
- @multiply = Proc.new { |a,b| a+b }
- @one = 0.0
- @null = 1.0/0.0
- @convert = Proc.new { |v| v }
- end
-end
-
-# for longest/worst paths
-class RealxSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| [a,b].max }
- @multiply = Proc.new { |a,b| a+b }
- @one = -1.0/0.0
- @null = 0.0
- @convert = Proc.new { |v| v }
- end
-end
-
-class CountingSemiring < Semiring
- def initialize
- @add = Proc.new { |a,b| a+b }
- @multiply = Proc.new { |a,b| a*b }
- @one = 1.0
- @null = 0.0
- @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
- end
-end
-
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
deleted file mode 100644
index aa9be00..0000000
--- a/lib/nlp_ruby/stringutil.rb
+++ /dev/null
@@ -1,22 +0,0 @@
-def tokenize s
- s.strip.split
-end
-
-def ngrams(s, n, fix=false)
- a = tokenize s
- a.each_with_index { |tok, i|
- tok.strip!
- 0.upto([n-1, a.size-i-1].min) { |m|
- yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
- }
- }
-end
-
-def bag_of_words s, stopwords=[]
- s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end
-
-def splitpipe s, n=3
- s.strip.split("|"*n)
-end
-
diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb
deleted file mode 100644
index 13a40a3..0000000
--- a/lib/nlp_ruby/tfidf.rb
+++ /dev/null
@@ -1,38 +0,0 @@
-module TFIDF
-
-
-# returns key='raw frequency' for an
-# array-like object
-def TFIDF::tf array, stopwords=[]
- v = {}; v.default = 0
- array.uniq.each { |i|
- next if stopwords.include? i
- v[i] = array.count(i).to_f
- }
- return v
-end
-
-# smoothes raw frequencies of tf() in-place
-# a is a smoothing term
-def TFIDF::ntf hash, a=0.4
- max = hash.values.max.to_f
- hash.each_pair { |k,v|
- hash[k] = a + (1-a)*(v/max)
- }
-end
-
-# returns idf value for each word in a vocabulary
-def TFIDF::idf list_of_hashes
- vocab = list_of_hashes.values.flatten.uniq
- n = list_of_hashes.size.to_f
- idf = {}
- vocab.each { |i|
- df = list_of_hashes.values.flatten.count i
- idf[i] = Math.log(n/df)
- }
- return idf
-end
-
-
-end #module
-
diff --git a/lib/zipf.rb b/lib/zipf.rb
new file mode 100755
index 0000000..0e26e97
--- /dev/null
+++ b/lib/zipf.rb
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby/stringutil'
+require 'nlp_ruby/fileutil'
+require 'nlp_ruby/SparseVector'
+require 'nlp_ruby/tfidf'
+require 'nlp_ruby/Translation'
+require 'nlp_ruby/dag'
+require 'nlp_ruby/semirings'
+require 'nlp_ruby/bleu'
+require 'nlp_ruby/misc'
+require 'nlp_ruby/hg'
+require 'nlp_ruby/grammar'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+STDERR.set_encoding 'utf-8'
+
diff --git a/lib/zipf/SparseVector.rb b/lib/zipf/SparseVector.rb
new file mode 100644
index 0000000..3096412
--- /dev/null
+++ b/lib/zipf/SparseVector.rb
@@ -0,0 +1,172 @@
+class SparseVector < Hash
+
+ def initialize arg=nil
+ super
+ self.default = 0
+ if arg.is_a? Array
+ from_a arg
+ end
+ end
+
+ def from_a a
+ a.each_with_index { |i,j| self[j] = i }
+ end
+
+ def self.from_a a
+ v = SparseVector.new
+ v.from_a a
+ return v
+ end
+
+ def from_h h
+ h.each_pair { |k,v| self[k] = v }
+ end
+
+ def self.from_h h
+ v = SparseVector.new
+ v.from_h h
+ return v
+ end
+
+ def from_s s
+ from_h eval(s)
+ end
+
+ def self.from_s s
+ v = SparseVector.new
+ v.from_s s
+ return v
+ end
+
+ def to_kv sep='=', join=' '
+ a = []
+ self.each_pair { |k,v|
+ a << "#{k}#{sep}#{v}"
+ }
+ return a.join join
+ end
+
+ def from_kv s
+ s.split.each { |i|
+ k,v = i.split('=')
+ self[k] = v.to_f
+ }
+ end
+
+ def self.from_kv s
+ v = SparseVector.new
+ v.from_kv s
+ return v
+ end
+
+ def from_file fn, sep='='
+ f = ReadFile.new(fn)
+ while line = f.gets
+ key, value = line.strip.split sep
+ value = value.to_f
+ self[key] = value
+ end
+ end
+
+ def self.from_file fn, sep='='
+ v = SparseVector.new
+ v.from_file fn, sep
+ return v
+ end
+
+ def join_keys other
+ self.keys + other.keys
+ end
+
+ def sum
+ self.values.inject(:+)
+ end
+
+ def approx_eql? other, p=10**-10
+ return false if !other
+ return false if other.size!=self.size
+ return false if other.keys.sort!=self.keys.sort
+ self.keys.each { |k|
+ return false if (self[k]-other[k]).abs>p
+ }
+ return true
+ end
+
+ def average
+ self.sum/self.size.to_f
+ end
+
+ def variance
+ avg = self.average
+ var = 0.0
+ self.values.each { |i| var += (avg - i)**2 }
+ return var
+ end
+
+ def stddev
+ Math.sqrt self.variance
+ end
+
+ def dot other
+ sum = 0.0
+ self.each_pair { |k,v| sum += v * other[k] }
+ return sum
+ end
+
+ def zeros n
+ (0).upto(n-1) { |i| self[i] = 0.0 }
+ end
+
+ def magnitude
+ Math.sqrt self.values.inject { |sum,i| sum+i**2 }
+ end
+
+ def cosinus_sim other
+ self.dot(other)/(self.magnitude*other.magnitude)
+ end
+
+ def euclidian_dist other
+ dims = [self.keys, other.keys].flatten.uniq
+ sum = 0.0
+ dims.each { |d| sum += (self[d] - other[d])**2 }
+ return Math.sqrt(sum)
+ end
+
+ def + other
+ new = SparseVector.new
+ join_keys(other).each { |k|
+ new[k] = self[k]+other[k]
+ }
+ return new
+ end
+
+ def - other
+ new = SparseVector.new
+ join_keys(other).each { |k|
+ new[k] = self[k]-other[k]
+ }
+ return new
+ end
+
+ def * scalar
+ raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
+ new = SparseVector.new
+ self.keys.each { |k|
+ new[k] = self[k] * scalar
+ }
+ return new
+ end
+
+ def self.mean a
+ mean = SparseVector.new
+ a.each { |i|
+ i.each_pair { |k,v|
+ mean[k] += v
+ }
+ }
+ n = a.size.to_f
+ mean.each_pair { |k,v| mean[k] = v/n }
+ return mean
+ end
+end
+
diff --git a/lib/zipf/Translation.rb b/lib/zipf/Translation.rb
new file mode 100644
index 0000000..3759a1d
--- /dev/null
+++ b/lib/zipf/Translation.rb
@@ -0,0 +1,72 @@
+class Translation
+ attr_accessor :id, :s, :raw, :f, :scores, :rank
+
+ def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
+ @id = id
+ @raw = raw
+ @s = s
+ @f = f
+ @scores = scores
+ @rank = rank
+ end
+
+ def from_s t, strip_alignment=true, rank=nil
+ id, raw, features, score = splitpipe(t, 3)
+ raw.strip!
+ @raw = raw
+ if strip_alignment # the way moses does it
+ @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
+ @s.strip!
+ else
+ @s = raw
+ end
+ @id = id.to_i
+ @f = SparseVector.from_kv features
+ @scores[:decoder] = score.to_f
+ @rank = rank
+ end
+
+ def self.from_s s
+ t = self.new
+ t.from_s s
+ return t
+ end
+
+ def to_s include_features=true
+ [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features
+ [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features
+ end
+
+ def to_s2
+ [@rank, @s, @score, @scores.to_s].join ' ||| '
+ end
+end
+
+def read_kbest_lists fn, translation_type=Translation
+ kbest_lists = []
+ cur = []
+ f = ReadFile.new fn
+ prev = -1
+ c = 0
+ id = 0
+ while line = f.gets
+ t = translation_type.new
+ t.from_s line
+ c = splitpipe(line)[0].to_i
+ if c != prev
+ if cur.size > 0
+ kbest_lists << cur
+ cur = []
+ end
+ prev = c
+ id = 0
+ end
+ t.id = id
+ cur << t
+ id += 1
+ end
+ kbest_lists << cur # last one
+ f.close
+ return kbest_lists
+end
+
diff --git a/lib/zipf/bleu.rb b/lib/zipf/bleu.rb
new file mode 100644
index 0000000..69de00b
--- /dev/null
+++ b/lib/zipf/bleu.rb
@@ -0,0 +1,130 @@
+module BLEU
+
+
+class BLEU::NgramCounts
+ attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
+
+ def initialize(n)
+ @n = 0
+ @sum = []
+ @clipped = []
+ @ref_len = 0.0
+ @hyp_len = 0.0
+ grow(n)
+ end
+
+ def grow(n)
+ (n-@n).times {
+ @sum << 0.0
+ @clipped << 0.0
+ }
+ @n = n
+ end
+
+ def plus_eq(other)
+ if other.n > @n then grow(other.n) end
+ 0.upto(other.n-1) { |m|
+ @sum[m] += other.sum[m]
+ @clipped[m] += other.clipped[m]
+ }
+ @ref_len += other.ref_len
+ @hyp_len += other.hyp_len
+ end
+
+ def to_s
+ return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
+ end
+end
+
+class BLEU::Ngrams
+ def initialize
+ @h_ = {}
+ @h_.default = 0
+ end
+
+ def add(k)
+ if k.class == Array then k = k.join ' ' end
+ @h_[k] += 1
+ end
+
+ def get_count(k)
+ if k.class == Array then k = k.join ' ' end
+ return @h_[k]
+ end
+
+ def each
+ @h_.each_pair { |k,v|
+ yield k.split, v
+ }
+ end
+
+ def to_s
+ @h_.to_s
+ end
+end
+
+def BLEU::get_counts hypothesis, reference, n, times=1
+ p = NgramCounts.new n
+ r = Ngrams.new
+ ngrams(reference, n) { |ng| r.add ng }
+ h = Ngrams.new
+ ngrams(hypothesis, n) { |ng| h.add ng }
+ h.each { |ng,count|
+ sz = ng.size-1
+ p.sum[sz] += count * times
+ p.clipped[sz] += [r.get_count(ng), count].min * times
+ }
+ p.ref_len = tokenize(reference.strip).size * times
+ p.hyp_len = tokenize(hypothesis.strip).size * times
+ return p
+end
+
+def BLEU::brevity_penalty c, r, smooth=0.0
+ return [0.0, 1.0-((r+smooth)/c)].min
+end
+
+def BLEU::bleu counts, n, debug=false
+ corpus_stats = NgramCounts.new n
+ counts.each { |i| corpus_stats.plus_eq i }
+ logbleu = 0.0
+ 0.upto(n-1) { |m|
+ STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
+ return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
+ logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
+ }
+ logbleu /= n
+ if debug
+ STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
+ STDERR.write "sum #{Math.exp(sum)}\n"
+ end
+ logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
+ return Math.exp logbleu
+end
+
+def BLEU::hbleu counts, n, debug=false
+ (100*bleu(counts, n, debug)).round(3)
+end
+
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
+ h_ng = {}; r_ng = {}
+ (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
+ ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
+ ngrams(reference, n) { |i| r_ng[i.size] << i }
+ m = [n, reference.split.size].min
+ add = 0.0
+ logbleu = 0.0
+ (1).upto(m) { |i|
+ counts_clipped = 0
+ counts_sum = h_ng[i].size
+ h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
+ add = 1.0 if i >= 2
+ logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
+ }
+ logbleu /= m
+ logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
+ return Math.exp logbleu
+end
+
+
+end #module
+
diff --git a/lib/zipf/dag.rb b/lib/zipf/dag.rb
new file mode 100644
index 0000000..45ede20
--- /dev/null
+++ b/lib/zipf/dag.rb
@@ -0,0 +1,205 @@
+module DAG
+
+require 'json'
+
+
+class DAG::Node
+ attr_accessor :label, :outgoing, :incoming, :score, :mark
+
+ def initialize label=nil, outgoing=[], incoming=[], score=nil
+ @label = label
+ @outgoing = outgoing
+ @incoming = incoming
+ @score = nil
+ end
+
+ def add_edge head, weight=0
+ exit if self==head # no self-cycles!
+ @outgoing << DAG::Edge.new(self, head, weight)
+ return @outgoing.last
+ end
+
+ def to_s
+ "DAG::Node"
+ end
+
+ def repr
+ "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
+ end
+end
+
+class DAG::Edge
+ attr_accessor :tail, :head, :weight, :mark
+
+ def initialize tail=nil, head=nil, weight=0
+ @tail = tail
+ @head = head
+ @weight = weight
+ @mark = false # did we already follow this edge? -- for topological sorting
+ end
+
+ def to_s
+ s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
+ s += " x" if @mark
+ s += ">"
+ s
+ end
+end
+
+# depth-first search
+# w/o markings as we do not have cycles
+def DAG::dfs n, target_label
+ return n if n.label==target_label # assumes uniq labels!
+ stack = n.outgoing.map { |i| i.head }
+ while !stack.empty?
+ m = stack.pop
+ return DAG::dfs m, target_label
+ end
+ return nil
+end
+
+# breadth-first search
+# w/o markings as we do not have cycles
+def DAG::bfs n, target_label
+ queue = [n]
+ while !queue.empty?
+ m = queue.shift
+ return m if m.label==target_label
+ m.outgoing.each { |e| queue << e.head }
+ end
+ return nil
+end
+
+# topological sort
+def DAG::topological_sort graph
+ sorted = []
+ s = graph.reject { |n| !n.incoming.empty? }
+ while !s.empty?
+ sorted << s.shift
+ sorted.last.outgoing.each { |e|
+ e.mark = true
+ s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
+ }
+ end
+ return sorted
+end
+
+# initialize graph scores with semiring One
+def DAG::init graph, semiring, source_node
+ graph.each {|n| n.score=semiring.null}
+ source_node.score = semiring.one
+end
+
+# viterbi
+def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
+ toposorted = DAG::topological_sort(graph)
+ DAG::init(graph, semiring, source_node)
+ toposorted.each { |n|
+ n.incoming.each { |e|
+ # update
+ n.score = \
+ semiring.add.call(n.score, \
+ semiring.multiply.call(e.tail.score, e.weight)
+ )
+ }
+ }
+end
+
+# forward viterbi
+def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
+ toposorted = DAG::topological_sort(graph)
+ DAG::init(graph, semiring, source_node)
+ toposorted.each { |n|
+ n.outgoing.each { |e|
+ e.head.score = \
+ semiring.add.call(e.head.score, \
+ semiring.multiply.call(n.score, e.weight)
+ )
+ }
+ }
+end
+
+# Dijkstra algorithm
+# for A*-search we would need an optimistic estimate of
+# future cost at each node
+def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
+ DAG::init(graph, semiring, source_node)
+ q = PriorityQueue.new graph
+ while !q.empty?
+ n = q.pop
+ n.outgoing.each { |e|
+ e.head.score = \
+ semiring.add.call(e.head.score, \
+ semiring.multiply.call(n.score, e.weight))
+ q.sort!
+ }
+ end
+end
+
+# Bellman-Ford algorithm
+def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
+ DAG::init(graph, semiring, source_node)
+ edges = []
+ graph.each { |n| edges |= n.outgoing }
+ # relax edges
+ (graph.size-1).times{ |i|
+ edges.each { |e|
+ e.head.score = \
+ semiring.add.call(e.head.score, \
+ semiring.multiply.call(e.tail.score, e.weight))
+ }
+ }
+ # we do not allow cycles (negative or positive)
+end
+
+# Floyd algorithm
+def DAG::floyd(graph, semiring=nil)
+ dist_matrix = []
+ graph.each_index { |i|
+ dist_matrix << []
+ graph.each_index { |j|
+ val = 1.0/0.0
+ val = 0.0 if i==j
+ dist_matrix.last << val
+ }
+ }
+ edges = []
+ graph.each { |n| edges |= n.outgoing }
+ edges.each { |e|
+ dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
+ }
+ 0.upto(graph.size-1) { |k|
+ 0.upto(graph.size-1) { |i|
+ 0.upto(graph.size-1) { |j|
+ if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
+ dist_matrix [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
+ end
+ }
+ }
+ }
+ return dist_matrix
+end
+
+
+# returns a list of nodes (graph) and a hash for finding
+# nodes by their label (these need to be unique!)
+def DAG::read_graph_from_json fn, semiring=RealSemiring.new
+ graph = []
+ nodes_by_label = {}
+ h = JSON.parse File.new(fn).read
+ h['nodes'].each { |i|
+ n = DAG::Node.new i['label']
+ graph << n
+ nodes_by_label[n.label] = n
+ }
+ h['edges'].each { |i|
+ n = nodes_by_label[i['tail']]
+ a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
+ nodes_by_label[i['head']].incoming << a
+ }
+ return graph, nodes_by_label
+end
+
+
+end #module
+
diff --git a/lib/zipf/fileutil.rb b/lib/zipf/fileutil.rb
new file mode 100644
index 0000000..eb69136
--- /dev/null
+++ b/lib/zipf/fileutil.rb
@@ -0,0 +1,88 @@
+require 'zlib'
+
+
+class ReadFile
+
+ def initialize fn, encoding='utf-8'
+ if fn.split('.').last == 'gz'
+ @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
+ elsif fn == '-'
+ @f = STDIN
+ STDIN.set_encoding encoding
+ else
+ @f = File.new fn, 'r'
+ @f.set_encoding encoding
+ end
+ end
+
+ def gets
+ @f.gets { |line| yield line }
+ end
+
+ def readlines
+ @f.readlines
+ end
+
+ def self.readlines fn, encoding='utf-8'
+ f = ReadFile.new fn, encoding
+ r = f.readlines
+ f.close
+ return r
+ end
+
+ def readlines_strip
+ self.readlines.map{ |i| i.strip }
+ end
+
+ def self.readlines_strip fn, encoding='utf-8'
+ f = ReadFile.new fn, encoding
+ r = f.readlines_strip
+ f.close
+ return r
+ end
+
+ def read
+ @f.read
+ end
+
+ def self.read fn, encoding='utf-8'
+ f = ReadFile.new fn, encoding
+ r = f.read
+ f.close
+ return r
+ end
+
+ def close
+ @f.close if @f!=STDIN
+ end
+end
+
+class WriteFile
+
+ def initialize fn, encoding='utf-8'
+ if fn.split('.').last == 'gz'
+ @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
+ elsif fn == '-'
+ @f = STDOUT
+ STDOUT.set_encoding encoding
+ else
+ @f = File.new fn, 'w+'
+ @f.set_encoding encoding
+ end
+ end
+
+ def write s
+ @f.write s
+ end
+
+ def self.write s, fn, encoding='utf-8'
+ f = WriteFile.new fn, encoding
+ f.write s
+ f.close
+ end
+
+ def close
+ @f.close if @f!=STDIN
+ end
+end
+
diff --git a/lib/zipf/grammar.rb b/lib/zipf/grammar.rb
new file mode 100644
index 0000000..568b9fc
--- /dev/null
+++ b/lib/zipf/grammar.rb
@@ -0,0 +1,123 @@
+module Grammar
+
+
+class T
+ attr_accessor :word
+
+ def initialize word
+ @word = word
+ end
+
+ def to_s
+ "T<#{@word}>"
+ end
+end
+
+class NT
+ attr_accessor :symbol, :index, :span
+
+ def initialize symbol, index=0
+ @symbol = symbol
+ @index = index
+ @span = Span.new
+ end
+
+ def to_s
+ "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
+ end
+end
+
+class Rule
+ attr_accessor :lhs, :rhs, :e
+
+ def initialize lhs=nil, rhs=[], e=''
+ @lhs = lhs
+ @rhs = rhs
+ @e = e
+ end
+
+ def to_s
+ "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
+ end
+
+ def arity
+ rhs.select { |i| i.class == NT }.size
+ end
+
+ def from_s s
+ _ = splitpipe s, 3
+ @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
+ _[1].split.each { |x|
+ x.strip!
+ if x[0]=='[' && x[x.size-1] == ']'
+ @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
+ else
+ @rhs << T.new(x)
+ end
+ }
+ @e = _[2]
+ end
+
+ def self.from_s s
+ r = self.new
+ r.from_s s
+ return r
+ end
+end
+
+class Span
+ attr_accessor :left, :right
+
+ def initialize left=nil, right=nil
+ @left = left
+ @right = right
+ end
+end
+
+class Grammar
+ attr_accessor :rules, :startn, :startt, :flat
+
+ def initialize fn
+ @rules = []; @startn = []; @startt = [] ;@flat = []
+ ReadFile.readlines_strip(fn).each_with_index { |s,i|
+ STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
+ @rules << Rule.from_s(s)
+ if @rules.last.rhs.first.class == NT
+ @startn << @rules.last
+ else
+ if rules.last.arity == 0
+ @flat << @rules.last
+ else
+ @startt << @rules.last
+ end
+ end
+ }
+ STDERR.write "\n"
+ end
+
+ def to_s
+ s = ''
+ @rules.each { |r| s += r.to_s+"\n" }
+ return s
+ end
+
+ def add_glue_rules
+ @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
+ @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
+ @startn << @rules.last
+ @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
+ @startn << @rules.last
+ }
+ end
+
+ def add_pass_through_rules s
+ s.each { |word|
+ @rules << Rule.new(NT.new('X'), [T.new(word)])
+ @flat << @rules.last
+ }
+ end
+end
+
+
+end #module
+
diff --git a/lib/zipf/hg.rb b/lib/zipf/hg.rb
new file mode 100644
index 0000000..f86bf60
--- /dev/null
+++ b/lib/zipf/hg.rb
@@ -0,0 +1,173 @@
+#!/usr/bin/env ruby
+
+require_relative 'semirings'
+require 'json'
+
+
+module HG
+
+
+class HG::Node
+ attr_accessor :label, :cat, :outgoing, :incoming, :score
+
+ def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
+ @label = label
+ @cat = cat
+ @outgoing = outgoing
+ @incoming = incoming
+ @score = nil
+ end
+
+ def to_s
+ "Node"
+ end
+end
+
+class HG::Hypergraph
+ attr_accessor :nodes, :edges
+
+ def initialize nodes=[], edges=[]
+ @nodes = nodes
+ @edges = edges
+ end
+
+ def arity
+ @edges.map { |e| e.arity }.max
+ end
+
+ def to_s
+ "Hypergraph"
+ end
+end
+
+class HG::Hyperedge
+ attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right
+
+ def initialize head=nil, tails=[], weight=0.0, f={}
+ @head = head
+ @tails = tails
+ @weight = weight
+ @f = f
+ @mark = 0
+ end
+
+ def arity
+ return @tails.size
+ end
+
+ def marked?
+ arity == @mark
+ end
+
+ def to_s
+ "Hyperedge"
+ end
+end
+
+def HG::topological_sort nodes
+ sorted = []
+ s = nodes.reject { |n| !n.incoming.empty? }
+ while !s.empty?
+ sorted << s.shift
+ sorted.last.outgoing.each { |e|
+ next if e.marked?
+ e.mark += 1
+ s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
+ }
+ end
+ return sorted
+end
+
+def HG::init nodes, semiring, root
+ nodes.each { |n| n.score=semiring.null }
+ root.score = semiring.one
+end
+
+def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
+ toposorted = topological_sort hypergraph.nodes
+ init toposorted, semiring, root
+ toposorted.each { |n|
+ n.incoming.each { |e|
+ s = semiring.one
+ e.tails.each { |m|
+ s = semiring.multiply.call(s, m.score)
+ }
+ n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
+ }
+ }
+end
+
+def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
+ toposorted = topological_sort hypergraph.nodes
+ init toposorted, semiring, root
+ best_path = []
+ toposorted.each { |n|
+ best_edge = nil
+ n.incoming.each { |e|
+ s = semiring.one
+ e.tails.each { |m|
+ s = semiring.multiply.call(s, m.score)
+ }
+ if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
+ best_edge = e
+ end
+ n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
+ }
+ best_path << best_edge
+ }
+ return best_path, toposorted.last.score
+end
+
+def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
+ nodes = []
+ edges = []
+ nodes_by_label = {}
+ nodes_by_index = []
+ h = JSON.parse File.new(fn).read
+ w = SparseVector.from_h h['weights']
+ h['nodes'].each { |i|
+ n = Node.new i['label'], i['cat']
+ nodes << n
+ nodes_by_label[n.label] = n
+ nodes_by_index << n
+ }
+ h['edges'].each { |i|
+ e = Hyperedge.new(nodes_by_label[i['head']], \
+ i['tails'].map{|j| nodes_by_label[j]}.to_a, \
+ semiring.convert.call(i['weight'].to_f), \
+ {})
+ e.f = SparseVector.from_h i['f']
+ if log_weights
+ e.weight = Math.exp(w.dot(e.f))
+ else
+ e.weight = w.dot(e.f)
+ end
+ e.tails.each { |m|
+ m.outgoing << e
+ }
+ e.head.incoming << e
+ edges << e
+ }
+ return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
+end
+
+def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
+ toposorted = topological_sort hypergraph.nodes
+ paths = [[]]
+ toposorted.each { |n|
+ next if n.incoming.empty?
+ new_paths = []
+ while !paths.empty?
+ p = paths.pop
+ n.incoming.each { |e|
+ new_paths << p+[e]
+ }
+ end
+ paths = new_paths
+ }
+ return paths
+end
+
+
+end #module
+
diff --git a/lib/zipf/misc.rb b/lib/zipf/misc.rb
new file mode 100644
index 0000000..0319a5f
--- /dev/null
+++ b/lib/zipf/misc.rb
@@ -0,0 +1,114 @@
+require 'timeout'
+
+
+class Array
+ def max_index
+ self.index(self.max)
+ end
+
+ def is_subset_of? other
+ self.each { |i|
+ if other.include? i
+ return false
+ end
+ }
+ return true
+ end
+
+ def sum
+ self.inject(:+)
+ end
+
+ def mean
+ self.sum.to_f/self.size
+ end
+end
+
+class String
+
+ def downcase?
+ self[/[[:lower:]]/]
+ end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+ def initialize a=Array.new
+ @queue = Array.new a
+ sort!
+ end
+
+ def sort!
+ @queue.sort_by! { |i| -i.score }
+ end
+
+ def pop
+ @queue.pop
+ end
+
+ def push i
+ @queue << i
+ sort!
+ end
+
+ def empty?
+ @queue.empty?
+ end
+end
+
+def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
+ STDERR.write cmd+"\n" if debug
+ pipe_in, pipe_out = IO.pipe
+ pid = Process.spawn(cmd, :out => pipe_out)
+ begin
+ Timeout.timeout(t) { Process.wait pid }
+ rescue Timeout::Error
+ Process.kill('TERM', pid) if !ignore_fail
+ end
+ pipe_out.close
+ return pipe_in.read
+end
+
+def read_phrase_table fn
+ table = {}
+ f = ReadFile.new fn
+ while raw_rule = f.gets
+ french, english, features = splitpipe(raw_rule)
+ feature_map = SparseVector.from_kv features
+ if table.has_key? french
+ table[french] << [english, feature_map ]
+ else
+ table[french] = [[english, feature_map]]
+ end
+ end
+ f.close
+ return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+ require 'open3'
+ cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+ cmd += " -r" if unique
+ o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
+ a = []; j = -1
+ o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+ return a
+end
+
+def read_config fn
+ f = ReadFile.new fn
+ cfg = {}
+ while line = f.gets
+ line.strip!
+ next if /^\s*$/.match line
+ next if line[0]=='#'
+ content = line.split('#', 2).first
+ k, v = content.split(/\s*=\s*/, 2)
+ k.strip!; v.strip!
+ cfg[k] = v
+ end
+ return cfg
+end
+
diff --git a/lib/zipf/semirings.rb b/lib/zipf/semirings.rb
new file mode 100644
index 0000000..920923f
--- /dev/null
+++ b/lib/zipf/semirings.rb
@@ -0,0 +1,81 @@
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
+# 'Dynamic Programming Algorithms in
+# Semiring and Hypergraph Frameworks' (Liang Huang)
+#
+
+class Semiring
+ attr_accessor :add, :multiply, :one, :null, :convert
+end
+
+class BooleanSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| a||b }
+ @multiply = Proc.new { |a,b| a&&b }
+ @one = true
+ @null = false
+ @convert = Proc.new { |v| true && v!=0 }
+ end
+end
+
+class ViterbiSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].max }
+ @multiply = Proc.new { |a,b| a*b }
+ @one = 1.0
+ @null = 0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class ViterbiLogSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].max }
+ @multiply = Proc.new { |a,b| a+b }
+ @one = 0.0
+ @null = -1.0/0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class InsideSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| a+b }
+ @multiply = Proc.new { |a,b| a*b }
+ @one = 1.0
+ @null = 0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class RealSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].min }
+ @multiply = Proc.new { |a,b| a+b }
+ @one = 0.0
+ @null = 1.0/0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+# for longest/worst paths
+class RealxSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].max }
+ @multiply = Proc.new { |a,b| a+b }
+ @one = -1.0/0.0
+ @null = 0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class CountingSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| a+b }
+ @multiply = Proc.new { |a,b| a*b }
+ @one = 1.0
+ @null = 0.0
+ @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
+ end
+end
+
diff --git a/lib/zipf/stringutil.rb b/lib/zipf/stringutil.rb
new file mode 100644
index 0000000..aa9be00
--- /dev/null
+++ b/lib/zipf/stringutil.rb
@@ -0,0 +1,22 @@
+def tokenize s
+ s.strip.split
+end
+
+def ngrams(s, n, fix=false)
+ a = tokenize s
+ a.each_with_index { |tok, i|
+ tok.strip!
+ 0.upto([n-1, a.size-i-1].min) { |m|
+ yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
+ }
+ }
+end
+
+def bag_of_words s, stopwords=[]
+ s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
+end
+
+def splitpipe s, n=3
+ s.strip.split("|"*n)
+end
+
diff --git a/lib/zipf/tfidf.rb b/lib/zipf/tfidf.rb
new file mode 100644
index 0000000..13a40a3
--- /dev/null
+++ b/lib/zipf/tfidf.rb
@@ -0,0 +1,38 @@
+module TFIDF
+
+
+# returns key='raw frequency' for an
+# array-like object
+def TFIDF::tf array, stopwords=[]
+ v = {}; v.default = 0
+ array.uniq.each { |i|
+ next if stopwords.include? i
+ v[i] = array.count(i).to_f
+ }
+ return v
+end
+
+# smoothes raw frequencies of tf() in-place
+# a is a smoothing term
+def TFIDF::ntf hash, a=0.4
+ max = hash.values.max.to_f
+ hash.each_pair { |k,v|
+ hash[k] = a + (1-a)*(v/max)
+ }
+end
+
+# returns idf value for each word in a vocabulary
+def TFIDF::idf list_of_hashes
+ vocab = list_of_hashes.values.flatten.uniq
+ n = list_of_hashes.size.to_f
+ idf = {}
+ vocab.each { |i|
+ df = list_of_hashes.values.flatten.count i
+ idf[i] = Math.log(n/df)
+ }
+ return idf
+end
+
+
+end #module
+
--
cgit v1.2.3