author     Patrick Simianer <p@simianer.de>  2014-01-29 19:22:56 +0100
committer  Patrick Simianer <p@simianer.de>  2014-01-29 19:22:56 +0100
commit     d9d72e06db07087aa54401fae8b259f0c4ccd649 (patch)
tree       97f0444314c40d2894ac0892d5559101eda01acf /lib
parent     22644ed1365e566c8bf806bfff4ecd43c46ce089 (diff)
first usable version, name change => nlp_ruby
Diffstat (limited to 'lib')
-rwxr-xr-x  lib/nlp_ruby.rb                 16
-rw-r--r--  lib/nlp_ruby/PriorityQueue.rb   36
-rw-r--r--  lib/nlp_ruby/SparseVector.rb    64
-rw-r--r--  lib/nlp_ruby/Vector.rb          43
-rw-r--r--  lib/nlp_ruby/dags.rb           218
-rw-r--r--  lib/nlp_ruby/fileutil.rb        61
-rw-r--r--  lib/nlp_ruby/semirings.rb       68
-rw-r--r--  lib/nlp_ruby/stringutil.rb      34
-rw-r--r--  lib/nlp_ruby/tfidf.rb           32
-rw-r--r--  lib/nlp_ruby/ttable.rb          17
-rw-r--r--  lib/rubynlp.rb                   0
11 files changed, 589 insertions, 0 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
new file mode 100755
index 0000000..4981bb0
--- /dev/null
+++ b/lib/nlp_ruby.rb
@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby/stringutil'
+require 'nlp_ruby/fileutil'
+require 'nlp_ruby/Vector'
+require 'nlp_ruby/SparseVector'
+require 'nlp_ruby/PriorityQueue'
+require 'nlp_ruby/tfidf'
+require 'nlp_ruby/ttable'
+require 'nlp_ruby/dags'
+require 'nlp_ruby/semirings'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+STDERR.set_encoding 'utf-8'
+
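A minimal sketch of how the library is meant to be loaded once it is on the load path (the script below is hypothetical, not part of the commit); the top-level file pulls in all submodules:

  require 'nlp_ruby'

  puts tokenize('Hello world').inspect  # => ["Hello", "world"]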
diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
new file mode 100644
index 0000000..22662b3
--- /dev/null
+++ b/lib/nlp_ruby/PriorityQueue.rb
@@ -0,0 +1,36 @@
+# a simple, array-based priority queue;
+# assumes that queued elements have a
+# numerical member named 'score'; pop
+# returns the element with the smallest score
+class PriorityQueue
+
+ def initialize a=Array.new
+ @queue = Array.new a
+ end
+
+ def sort!
+ @queue.sort_by! { |i| -i.score }
+ end
+
+ def pop
+ sort!
+ @queue.pop
+ end
+
+ def push i
+ @queue << i
+ end
+
+ def empty?
+ @queue.empty?
+ end
+
+ # FIXME
+ def to_s
+ a = []
+ @queue.each { |i|
+ a << "#{i.to_s}[#{i.score}]"
+ }
+ "[#{a.join ', '}]"
+ end
+end
+
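A quick sketch of the queue's contract; the Item struct is hypothetical, standing in for anything with a numerical 'score' member:

  Item = Struct.new(:name, :score)

  q = PriorityQueue.new
  q.push Item.new('a', 3.0)
  q.push Item.new('b', 1.0)
  q.pop.name  # => "b", the element with the smallest score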
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
new file mode 100644
index 0000000..0033690
--- /dev/null
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -0,0 +1,64 @@
+class SparseVector < Hash
+
+ def initialize
+ super
+ self.default = 0
+ end
+
+ def from_hash h
+ h.each_pair { |k,v| self[k] = v }
+ end
+
+ def sum
+ self.values.inject(:+)
+ end
+
+ def average
+ self.sum/self.size.to_f
+ end
+
+ def variance
+ avg = self.average
+ var = 0.0
+ self.values.each { |i| var += (avg - i)**2 }
+ return var/self.size
+ end
+
+ def stddev
+ Math.sqrt self.variance
+ end
+
+ def dot other
+ sum = 0.0
+ self.each_pair { |k,v| sum += v * other[k] }
+ return sum
+ end
+
+ def magnitude
+ Math.sqrt self.values.inject(0.0) { |sum,i| sum+i**2 }
+ end
+
+ def cosinus_sim other
+ self.dot(other)/(self.magnitude*other.magnitude)
+ end
+
+ def euclidian_dist other
+ dims = [self.keys, other.keys].flatten.uniq
+ sum = 0.0
+ dims.each { |d| sum += (self[d] - other[d])**2 }
+ return Math.sqrt(sum)
+ end
+end
+
+def mean_sparse_vector array_of_vectors
+ mean = SparseVector.new
+ array_of_vectors.each { |i|
+ i.each_pair { |k,v|
+ mean[k] += v
+ }
+ }
+ n = array_of_vectors.size.to_f
+ mean.each_pair { |k,v| mean[k] = v/n }
+ return mean
+end
+
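A small sketch of the sparse-vector operations; missing dimensions default to 0, so the dot product only touches shared keys:

  a = SparseVector.new
  a.from_hash({'x'=>1.0, 'y'=>2.0})
  b = SparseVector.new
  b.from_hash({'y'=>3.0})

  a.dot(b)             # => 6.0
  a.euclidian_dist(b)  # => Math.sqrt(2.0), over dimensions x and y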
diff --git a/lib/nlp_ruby/Vector.rb b/lib/nlp_ruby/Vector.rb
new file mode 100644
index 0000000..c749ed9
--- /dev/null
+++ b/lib/nlp_ruby/Vector.rb
@@ -0,0 +1,43 @@
+class Vector < Array
+
+ def sum
+ self.inject(:+)
+ end
+
+ def average
+ self.sum/self.size.to_f
+ end
+
+ def variance
+ avg = self.average
+ var = 0.0
+ self.each { |i| var += (avg - i)**2 }
+ var/self.size
+ end
+
+ def stddev
+ Math.sqrt self.variance
+ end
+
+ def dot other
+ return nil if self.size!=other.size
+ sum = 0.0
+ self.each_with_index { |i,j| sum += i*other[j] }
+ return sum
+ end
+
+ def magnitude
+ Math.sqrt self.inject(0.0) { |sum,i| sum+i**2 }
+ end
+
+ def cosinus_sim other
+ return self.dot(other)/(self.magnitude*other.magnitude)
+ end
+
+ def euclidian_dist other
+ sum = 0.0
+ self.each_with_index { |i,j| sum += (i - other[j])**2 }
+ return Math.sqrt(sum)
+ end
+end
+
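A Vector is just an Array with numerical helpers; a sketch using the Vector[] constructor inherited from Array:

  v = Vector[1.0, 2.0, 3.0]
  w = Vector[0.0, 1.0, 0.0]

  v.sum                # => 6.0
  v.dot(w)             # => 2.0
  v.magnitude          # => Math.sqrt(14.0)
  v.dot(Vector[1.0])   # => nil, sizes differ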
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dags.rb
new file mode 100644
index 0000000..7767be1
--- /dev/null
+++ b/lib/nlp_ruby/dags.rb
@@ -0,0 +1,218 @@
+###########################
+# TODO
+# output paths
+# visualization?
+# algorithms:
+# beam search
+# best-first
+# kbest
+# kruskal (MST)?
+# transitive closure?
+###########################
+
+require 'json'
+
+
+module DAG
+
+
+class DAG::Node
+ attr_accessor :label, :edges, :incoming, :score, :mark
+
+ def initialize label=nil, edges=[], incoming=[], score=nil
+ @label = label
+ @edges = edges # outgoing
+ @incoming = incoming
+ @score = score
+ end
+
+ def add_edge head, weight=0
+ raise 'self-cycles are not allowed!' if self==head
+ @edges << DAG::Edge.new(self, head, weight)
+ return @edges.last
+ end
+
+ def to_s
+ "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>"
+ end
+
+ def repr
+ "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
+ end
+end
+
+class DAG::Edge
+ attr_accessor :tail, :head, :weight, :mark
+
+ def initialize tail=nil, head=nil, weight=0
+ @tail = tail
+ @head = head
+ @weight = weight
+ @mark = false # did we already follow this edge? -- for topological sorting
+ end
+
+ def to_s
+ s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
+ s += " x" if @mark
+ s += ">"
+ s
+ end
+end
+
+# depth-first search
+# w/o markings as we do not have cycles
+def DAG::dfs n, target_label
+ return n if n.label==target_label # assumes uniq labels!
+ n.edges.each { |e|
+ m = DAG::dfs e.head, target_label
+ return m if m
+ }
+ return nil
+end
+
+# breadth-first search
+# w/o markings as we do not have cycles
+def DAG::bfs n, target_label
+ queue = [n]
+ while !queue.empty?
+ m = queue.shift
+ return m if m.label==target_label
+ m.edges.each { |e| queue << e.head }
+ end
+ return nil
+end
+
+# topological sort
+def DAG::topological_sort graph
+ sorted = []
+ s = graph.reject { |n| !n.incoming.empty? }
+ while !s.empty?
+ sorted << s.shift
+ sorted.last.edges.each { |e|
+ e.mark = true
+ s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
+ }
+ end
+ return sorted
+end
+
+# initialize all scores with semiring null,
+# the source node with semiring one
+def DAG::init graph, semiring, source_node
+ graph.each {|n| n.score=semiring.null}
+ source_node.score = semiring.one
+end
+
+# viterbi
+def DAG::viterbi graph, semiring=ViterbiSemiring.new, source_node
+ toposorted = DAG::topological_sort(graph)
+ DAG::init(graph, semiring, source_node)
+ toposorted.each { |n|
+ n.incoming.each { |e|
+ # update
+ n.score = \
+ semiring.add.call(n.score, \
+ semiring.multiply.call(e.tail.score, e.weight)
+ )
+ }
+ }
+end
+
+# forward viterbi
+def DAG::viterbi_forward graph, semiring=ViterbiSemiring.new, source_node
+ toposorted = DAG::topological_sort(graph)
+ DAG::init(graph, semiring, source_node)
+ toposorted.each { |n|
+ n.edges.each { |e|
+ e.head.score = \
+ semiring.add.call(e.head.score, \
+ semiring.multiply.call(n.score, e.weight)
+ )
+ }
+ }
+end
+
+# Dijkstra algorithm
+# for A*-search we would need an optimistic estimate of
+# future cost at each node
+def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
+ DAG::init(graph, semiring, source_node)
+ q = PriorityQueue.new graph
+ while !q.empty?
+ n = q.pop
+ n.edges.each { |e|
+ e.head.score = \
+ semiring.add.call(e.head.score, \
+ semiring.multiply.call(n.score, e.weight))
+ q.sort!
+ }
+ end
+end
+
+# Bellman-Ford algorithm
+def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
+ DAG::init(graph, semiring, source_node)
+ edges = []
+ graph.each { |n| edges |= n.edges }
+ # relax edges
+ (graph.size-1).times{ |i|
+ edges.each { |e|
+ e.head.score = \
+ semiring.add.call(e.head.score, \
+ semiring.multiply.call(e.tail.score, e.weight))
+ }
+ }
+ # we do not allow cycles (negative or positive)
+end
+
+# Floyd algorithm
+def DAG::floyd(graph, semiring=nil)
+ dist_matrix = []
+ graph.each_index { |i|
+ dist_matrix << []
+ graph.each_index { |j|
+ val = 1.0/0.0
+ val = 0.0 if i==j
+ dist_matrix.last << val
+ }
+ }
+ edges = []
+ graph.each { |n| edges |= n.edges }
+ edges.each { |e|
+ dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
+ }
+ 0.upto(graph.size-1) { |k|
+ 0.upto(graph.size-1) { |i|
+ 0.upto(graph.size-1) { |j|
+ if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
+ dist_matrix[i][j] = dist_matrix[i][k] + dist_matrix[k][j]
+ end
+ }
+ }
+ }
+ return dist_matrix
+end
+
+
+# returns a list of nodes (graph) and a hash for finding
+# nodes by their label (these need to be unique!)
+def DAG::read_graph_from_json fn, semiring=RealSemiring.new
+ graph = []
+ nodes_by_label = {}
+ h = JSON.parse File.read(fn)
+ h['nodes'].each { |i|
+ n = DAG::Node.new i['label']
+ graph << n
+ nodes_by_label[n.label] = n
+ }
+ h['edges'].each { |i|
+ n = nodes_by_label[i['tail']]
+ a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
+ nodes_by_label[i['head']].incoming << a
+ }
+ return graph, nodes_by_label
+end
+
+
+end # module
+
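A sketch of building a tiny chain a -> b -> c by hand and scoring it with the forward pass; incoming edges have to be wired up manually, just as read_graph_from_json does:

  a = DAG::Node.new 'a'
  b = DAG::Node.new 'b'
  c = DAG::Node.new 'c'
  [[a,b,1.0], [b,c,2.0]].each { |tail,head,w|
    head.incoming << tail.add_edge(head, w)
  }
  graph = [a, b, c]

  DAG::viterbi_forward graph, ViterbiSemiring.new, a
  c.score  # => 2.0, i.e. 1.0*2.0 under max/times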
diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb
new file mode 100644
index 0000000..825ceb4
--- /dev/null
+++ b/lib/nlp_ruby/fileutil.rb
@@ -0,0 +1,61 @@
+require 'zlib'
+
+
+class ReadFile
+
+ def initialize fn, encoding='utf-8'
+ if fn.split('.').last == 'gz'
+ @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
+ elsif fn == '-'
+ @f = STDIN
+ STDIN.set_encoding encoding
+ else
+ @f = File.new fn, 'r'
+ @f.set_encoding encoding
+ end
+ end
+
+ def gets
+ @f.gets
+ end
+
+ def readlines
+ @f.readlines
+ end
+
+ def readlines_strip
+ self.readlines.map{ |i| i.strip }
+ end
+
+ def read
+ @f.read
+ end
+
+ def close
+ @f.close if @f!=STDIN
+ end
+end
+
+class WriteFile
+
+ def initialize fn, encoding='utf-8'
+ if fn.split('.').last == 'gz'
+ @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
+ elsif fn == '-'
+ @f = STDOUT
+ STDOUT.set_encoding encoding
+ else
+ @f = File.new fn, 'w+'
+ @f.set_encoding encoding
+ end
+ end
+
+ def write s
+ @f.write s
+ end
+
+ def close
+ @f.close if @f!=STDOUT
+ end
+end
+
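A sketch of the intended interface (file names are hypothetical): gzip compression is detected via the file extension, and '-' means stdin/stdout:

  f = ReadFile.new 'corpus.txt.gz'
  f.readlines_strip.each { |line| puts line }
  f.close

  o = WriteFile.new '-'   # writes to STDOUT
  o.write "done\n"
  o.close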
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
new file mode 100644
index 0000000..a06f151
--- /dev/null
+++ b/lib/nlp_ruby/semirings.rb
@@ -0,0 +1,68 @@
+# semirings for graphs as described in
+# 'Dynamic Programming Algorithms in
+# Semiring and Hypergraph Frameworks' (Liang Huang)
+class Semiring
+ attr_accessor :add, :multiply, :one, :null, :convert
+end
+
+class BooleanSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| a||b }
+ @multiply = Proc.new { |a,b| a&&b }
+ @one = true
+ @null = false
+ @convert = Proc.new { |v| v!=0 }
+ end
+end
+
+class ViterbiSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].max }
+ @multiply = Proc.new { |a,b| a*b }
+ @one = 1.0
+ @null = 0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class InsideSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| a+b }
+ @multiply = Proc.new { |a,b| a*b }
+ @one = 1.0
+ @null = 0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class RealSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].min }
+ @multiply = Proc.new { |a,b| a+b }
+ @one = 0.0
+ @null = 1.0/0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+# for longest/worst paths
+class RealxSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| [a,b].max }
+ @multiply = Proc.new { |a,b| a+b }
+ @one = 0.0
+ @null = -1.0/0.0
+ @convert = Proc.new { |v| v }
+ end
+end
+
+class CountingSemiring < Semiring
+ def initialize
+ @add = Proc.new { |a,b| a+b }
+ @multiply = Proc.new { |a,b| a*b }
+ @one = 1.0
+ @null = 0.0
+ @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
+ end
+end
+
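The semirings are meant to be plugged into the DAG algorithms above; a sketch of evaluating one directly, here RealSemiring (min/plus, i.e. shortest paths):

  s = RealSemiring.new
  s.add.call(3.0, 5.0)       # => 3.0 (min)
  s.multiply.call(3.0, 5.0)  # => 8.0 (plus)
  s.null                     # => Infinity, the identity of min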
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
new file mode 100644
index 0000000..e9a3bc9
--- /dev/null
+++ b/lib/nlp_ruby/stringutil.rb
@@ -0,0 +1,34 @@
+# whitespace 'tokenizer'
+def tokenize s
+ s.strip.split
+end
+
+def splitpipe s
+ s.strip.split(/\s*\|\|\|\s*/)
+end
+
+# truthy iff the string contains
+# at least one lowercase letter
+def downcase? s
+ s[/[[:lower:]]/]
+end
+
+# iterator over n-grams; yields all m-grams
+# with m<=n, or only m==n if fix is set
+def ngrams(s, n, fix=false)
+ a = tokenize s
+ a.each_with_index { |tok, i|
+ tok.strip!
+ 0.upto([n-1, a.size-i-1].min) { |m|
+ yield a[i..i+m] if !fix || a[i..i+m].size==n
+ }
+ }
+end
+
+# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
+def read_feature_string s
+ map = SparseVector.new
+ tokenize(s).each { |i|
+ key, value = i.split '='
+ map[key] = value.to_f
+ }
+ return map
+end
+
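A sketch of the string helpers in action:

  ngrams('a b c', 2) { |g| print g.inspect, ' ' }
  # ["a"] ["a", "b"] ["b"] ["b", "c"] ["c"]
  ngrams('a b c', 2, true) { |g| print g.inspect, ' ' }
  # ["a", "b"] ["b", "c"]

  read_feature_string 'a=1.0 b=2.0'  # => {"a"=>1.0, "b"=>2.0}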
diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb
new file mode 100644
index 0000000..84d55a5
--- /dev/null
+++ b/lib/nlp_ruby/tfidf.rb
@@ -0,0 +1,32 @@
+# returns a hash that maps each item of an
+# array-like object to its raw frequency
+def tf array, stopwords=[]
+ v = {}; v.default = 0
+ array.uniq.each { |i|
+ next if stopwords.include? i
+ v[i] = array.count(i).to_f
+ }
+ return v
+end
+
+# smoothes raw frequencies of tf() in-place
+# a is a smoothing term
+def ntf hash, a=0.4
+ max = hash.values.max.to_f
+ hash.each_pair { |k,v|
+ hash[k] = a + (1-a)*(v/max)
+ }
+end
+
+# returns the idf value for each word in the
+# vocabulary, given a list of tf() hashes
+def idf list_of_hashes
+ vocab = list_of_hashes.map { |h| h.keys }.flatten.uniq
+ n = list_of_hashes.size.to_f
+ idf = {}
+ vocab.each { |i|
+ df = list_of_hashes.count { |h| h.has_key? i }
+ idf[i] = Math.log(n/df)
+ }
+ return idf
+end
+
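A sketch of the full tf-idf pipeline over a toy corpus of two tokenized documents:

  docs = [%w(a b a), %w(b c)]
  tfs  = docs.map { |d| tf d }  # [{"a"=>2.0, "b"=>1.0}, {"b"=>1.0, "c"=>1.0}]
  tfs.each { |h| ntf h }        # smooth raw frequencies in-place
  idfs = idf tfs                # "b" occurs in every document => idf 0.0
  tfs[0]['a'] * idfs['a']       # tf-idf weight of "a" in the first document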
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb
new file mode 100644
index 0000000..20b1412
--- /dev/null
+++ b/lib/nlp_ruby/ttable.rb
@@ -0,0 +1,17 @@
+# table['some French string'] = [[English string, feature map], ...]
+def read_phrase_table fn
+ table = {}
+ f = ReadFile.new fn
+ while raw_rule = f.gets
+ french, english, features = splitpipe(raw_rule)
+ feature_map = read_feature_string(features)
+ if table.has_key? french
+ table[french] << [english, feature_map]
+ else
+ table[french] = [[english, feature_map]]
+ end
+ end
+ f.close
+ return table
+end
+
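The expected phrase-table format is one rule per line with fields separated by ' ||| '; a sketch (the file name and its contents are hypothetical):

  # phrases.txt contains, e.g.:
  #   la maison ||| the house ||| a=1.0 b=2.0
  table = read_phrase_table 'phrases.txt'
  table['la maison']  # => [["the house", {"a"=>1.0, "b"=>2.0}]]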
diff --git a/lib/rubynlp.rb b/lib/rubynlp.rb
deleted file mode 100644
index e69de29..0000000
--- a/lib/rubynlp.rb
+++ /dev/null