diff options
author | Patrick Simianer <p@simianer.de> | 2014-01-29 19:22:56 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2014-01-29 19:22:56 +0100 |
commit | d9d72e06db07087aa54401fae8b259f0c4ccd649 (patch) | |
tree | 97f0444314c40d2894ac0892d5559101eda01acf /lib | |
parent | 22644ed1365e566c8bf806bfff4ecd43c46ce089 (diff) |
first usable version, name change => nlp_ruby
Diffstat (limited to 'lib')
-rwxr-xr-x | lib/nlp_ruby.rb | 16 | ||||
-rw-r--r-- | lib/nlp_ruby/PriorityQueue.rb | 36 | ||||
-rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 64 | ||||
-rw-r--r-- | lib/nlp_ruby/Vector.rb | 43 | ||||
-rw-r--r-- | lib/nlp_ruby/dags.rb | 218 | ||||
-rw-r--r-- | lib/nlp_ruby/fileutil.rb | 61 | ||||
-rw-r--r-- | lib/nlp_ruby/semirings.rb | 68 | ||||
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 34 | ||||
-rw-r--r-- | lib/nlp_ruby/tfidf.rb | 32 | ||||
-rw-r--r-- | lib/nlp_ruby/ttable.rb | 17 | ||||
-rw-r--r-- | lib/rubynlp.rb | 0 |
11 files changed, 589 insertions, 0 deletions
############################################################
# NOTE(review): the source is a cgit diff page with eleven
# new library files flattened onto a few physical lines.
# This edit re-emits the added Ruby sources as clean code
# (one section per file) with these concrete fixes:
#   * nlp_ruby.rb   : requires pointed at 'rubynlp/...' but the
#                     files live under lib/nlp_ruby/ (LoadError).
#   * Vector        : cosinus_sim called undefined 'mag';
#                     stddev had a bogus *required* argument;
#                     variance did not divide by n.
#   * SparseVector  : variance did not divide by n; magnitude
#                     crashed on the empty vector.
#   * DAG::Node     : the 'score' ctor argument was ignored.
#   * DAG::dfs      : only ever explored a single branch.
#   * DAG::viterbi* : default semiring was the class, not an
#                     instance.
#   * ReadFile#gets : passed a block to IO#gets, which ignores
#                     blocks -- it never yielded.
#   * WriteFile     : Zlib::GzipWrite -> Zlib::GzipWriter;
#                     close compared against STDIN, not STDOUT.
#   * idf           : called Hash#values on an Array and built a
#                     vocabulary of hash objects, not words.
############################################################

# --- lib/nlp_ruby.rb (mode 100755) ---------------------------
#!/usr/bin/env ruby

# FIX(review): these pointed at 'rubynlp/...' although the files
# live under lib/nlp_ruby/ -- every require raised LoadError.
require 'nlp_ruby/stringutil'
require 'nlp_ruby/fileutil'
require 'nlp_ruby/Vector'
require 'nlp_ruby/SparseVector'
require 'nlp_ruby/PriorityQueue'
require 'nlp_ruby/tfidf'
require 'nlp_ruby/ttable'
require 'nlp_ruby/dags'
require 'nlp_ruby/semirings'

STDIN.set_encoding  'utf-8'
STDOUT.set_encoding 'utf-8'
STDERR.set_encoding 'utf-8'


# --- lib/nlp_ruby/PriorityQueue.rb ---------------------------
# Priority queue over items that expose a numerical 'score'
# member.  The queue is kept sorted by descending score, so
# #pop returns the item with the *smallest* score (min-queue,
# as required by DAG::dijkstra).
class PriorityQueue

  # a: initial items; copied, the caller's array is not mutated.
  def initialize a=Array.new
    @queue = Array.new a
  end

  # Re-sort by descending score (cheapest item last).
  def sort!
    @queue.sort_by! { |i| -i.score }
  end

  # Remove and return the item with the smallest score.
  def pop
    sort!
    @queue.pop
  end

  def push i
    @queue << i
  end

  def empty?
    @queue.empty?
  end

  def to_s
    a = @queue.map { |i| "#{i.to_s}[#{i.score}]" }
    "[#{a.join ', '}]"
  end
end


# --- lib/nlp_ruby/SparseVector.rb ----------------------------
# Sparse vector as a Hash from dimension -> value; absent
# dimensions read as 0.
class SparseVector < Hash

  def initialize
    super
    self.default = 0
  end

  # Copy all key/value pairs of hash h into self.
  def from_hash h
    h.each_pair { |k,v| self[k] = v }
  end

  def sum
    self.values.inject(:+)
  end

  # Mean over the *stored* values (absent dimensions are not
  # counted).
  def average
    self.sum/self.size.to_f
  end

  # Population variance over the stored values.
  # FIX(review): the original returned the bare sum of squared
  # deviations without dividing by the number of elements.
  def variance
    avg = self.average
    self.values.inject(0.0) { |var,i| var + (avg-i)**2 } / self.size
  end

  def stddev
    Math.sqrt self.variance
  end

  # Dot product; dimensions missing in either vector contribute 0.
  def dot other
    sum = 0.0
    self.each_pair { |k,v| sum += v * other[k] }
    return sum
  end

  # Euclidean length.
  # FIX(review): inject is seeded with 0.0 so the empty vector
  # yields 0.0 instead of crashing on nil.
  def magnitude
    Math.sqrt self.values.inject(0.0) { |sum,i| sum+i**2 }
  end

  # Cosine similarity (method name kept for compatibility).
  def cosinus_sim other
    self.dot(other)/(self.magnitude*other.magnitude)
  end

  # Euclidean distance over the union of both key sets
  # (method name kept for compatibility).
  def euclidian_dist other
    dims = self.keys | other.keys
    sum = 0.0
    dims.each { |d| sum += (self[d] - other[d])**2 }
    return Math.sqrt(sum)
  end
end

# Component-wise mean of an array of SparseVectors.
def mean_sparse_vector array_of_vectors
  mean = SparseVector.new
  array_of_vectors.each { |v|
    v.each_pair { |k,value| mean[k] += value }
  }
  n = array_of_vectors.size.to_f
  mean.each_pair { |k,v| mean[k] = v/n }
  return mean
end


# --- lib/nlp_ruby/Vector.rb ----------------------------------
# Dense numerical vector on top of Array.
class Vector < Array

  def sum
    self.inject(:+)
  end

  def average
    self.sum/self.size.to_f
  end

  # Population variance.
  # FIX(review): divide by size (the original returned the raw
  # sum of squared deviations).
  def variance
    avg = self.average
    self.inject(0.0) { |var,i| var + (avg-i)**2 } / self.size
  end

  # FIX(review): both parameters were unused and the first was
  # *required*; they are kept as optionals so old callers that
  # passed them keep working.
  def stddev list_of_values=nil, decimal_places=-1
    Math.sqrt self.variance
  end

  # Dot product; nil if the sizes differ.
  def dot other
    return nil if self.size!=other.size
    sum = 0.0
    self.each_with_index { |i,j| sum += i*other[j] }
    return sum
  end

  # Euclidean length; 0.0 for the empty vector.
  def magnitude
    Math.sqrt self.inject(0.0) { |sum,i| sum+i**2 }
  end

  # FIX(review): called the undefined method 'mag'.
  def cosinus_sim other
    return self.dot(other)/(self.magnitude*other.magnitude)
  end

  def euclidian_dist other
    sum = 0.0
    self.each_with_index { |i,j| sum += (i - other[j])**2 }
    return Math.sqrt(sum)
  end
end


# --- lib/nlp_ruby/dags.rb ------------------------------------
###########################
# TODO
#  output paths
#  visualization?
#  algorithms:
#   beam search, best-first, kbest,
#   kruskal (MST)?, transitive closure?
###########################

require 'json'


module DAG

class Node
  attr_accessor :label, :edges, :incoming, :score, :mark

  def initialize label=nil, edges=[], incoming=[], score=nil
    @label    = label
    @edges    = edges    # outgoing
    @incoming = incoming
    @score    = score    # FIX(review): argument was ignored (@score = nil)
  end

  # Adds (and returns) an outgoing edge to 'head'.
  # NOTE(review): the original exits the whole process on a
  # self-cycle; behavior kept, but raising would be kinder.
  def add_edge head, weight=0
    exit if self==head # no self-cycles!
    @edges << DAG::Edge.new(self, head, weight)
    return @edges.last
  end

  def to_s
    "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>"
  end

  def repr
    "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
  end
end

class Edge
  attr_accessor :tail, :head, :weight, :mark

  def initialize tail=nil, head=nil, weight=0
    @tail   = tail
    @head   = head
    @weight = weight
    @mark   = false # did we already follow this edge? -- for topological sorting
  end

  def to_s
    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
    s += " x" if @mark
    s += ">"
    s
  end
end

# depth-first search, w/o markings as we do not have cycles;
# assumes unique labels!
# FIX(review): the original popped a work stack but returned the
# recursion of the first popped node *unconditionally*, so only
# one branch was ever explored.
def DAG::dfs n, target_label
  return n if n.label==target_label
  n.edges.each { |e|
    found = DAG::dfs e.head, target_label
    return found if found
  }
  return nil
end

# breadth-first search, w/o markings as we do not have cycles;
# assumes unique labels!
def DAG::bfs n, target_label
  queue = [n]
  while !queue.empty?
    m = queue.shift
    return m if m.label==target_label
    m.edges.each { |e| queue << e.head }
  end
  return nil
end

# Kahn-style topological sort; consumes Edge#mark to remember
# already-followed edges.
def DAG::topological_sort graph
  sorted = []
  s = graph.select { |n| n.incoming.empty? }
  while !s.empty?
    sorted << s.shift
    sorted.last.edges.each { |e|
      e.mark = true
      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
    }
  end
  return sorted
end

# initialize all scores with semiring null, the source with one
def DAG::init graph, semiring, source_node
  graph.each {|n| n.score=semiring.null}
  source_node.score = semiring.one
end

# viterbi (pull: each node folds over its incoming edges)
# FIX(review): the default was the ViterbiSemiring *class*,
# which has no 'add'/'multiply' instance methods -> .new
def DAG::viterbi graph, semiring=ViterbiSemiring.new, source_node
  toposorted = DAG::topological_sort(graph)
  DAG::init(graph, semiring, source_node)
  toposorted.each { |n|
    n.incoming.each { |e|
      n.score = \
        semiring.add.call(n.score, \
          semiring.multiply.call(e.tail.score, e.weight)
        )
    }
  }
end

# forward viterbi (push: each node updates its successors)
# FIX(review): default semiring instantiated (see viterbi).
def DAG::viterbi_forward graph, semiring=ViterbiSemiring.new, source_node
  toposorted = DAG::topological_sort(graph)
  DAG::init(graph, semiring, source_node)
  toposorted.each { |n|
    n.edges.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(n.score, e.weight)
        )
    }
  }
end

# Dijkstra algorithm
# for A*-search we would need an optimistic estimate of
# future cost at each node
def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
  DAG::init(graph, semiring, source_node)
  q = PriorityQueue.new graph
  while !q.empty?
    n = q.pop
    n.edges.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(n.score, e.weight))
      q.sort!
    }
  end
end

# Bellman-Ford algorithm; we do not allow cycles
# (negative or positive)
def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
  DAG::init(graph, semiring, source_node)
  edges = []
  graph.each { |n| edges |= n.edges }
  # relax edges |V|-1 times
  (graph.size-1).times { |i|
    edges.each { |e|
      e.head.score = \
        semiring.add.call(e.head.score, \
          semiring.multiply.call(e.tail.score, e.weight))
    }
  }
end

# Floyd-Warshall all-pairs shortest paths; returns the
# distance matrix indexed by node position in 'graph'.
def DAG::floyd(graph, semiring=nil)
  dist_matrix = []
  graph.each_index { |i|
    dist_matrix << []
    graph.each_index { |j|
      val = 1.0/0.0
      val = 0.0 if i==j
      dist_matrix.last << val
    }
  }
  edges = []
  graph.each { |n| edges |= n.edges }
  edges.each { |e|
    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
  }
  0.upto(graph.size-1) { |k|
    0.upto(graph.size-1) { |i|
      0.upto(graph.size-1) { |j|
        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
          dist_matrix[i][j] = dist_matrix[i][k] + dist_matrix[k][j]
        end
      }
    }
  }
  return dist_matrix
end

# returns a list of nodes (graph) and a hash for finding
# nodes by their label (these need to be unique!)
def DAG::read_graph_from_json fn, semiring=RealSemiring.new
  graph = []
  nodes_by_label = {}
  h = JSON.parse File.new(fn).read
  h['nodes'].each { |i|
    n = DAG::Node.new i['label']
    graph << n
    nodes_by_label[n.label] = n
  }
  h['edges'].each { |i|
    n = nodes_by_label[i['tail']]
    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
    nodes_by_label[i['head']].incoming << a
  }
  return graph, nodes_by_label
end

end # module


# --- lib/nlp_ruby/fileutil.rb --------------------------------
require 'zlib'


# Reader for plain files, gzipped ('.gz') files or STDIN ('-').
class ReadFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDIN
      STDIN.set_encoding encoding
    else
      @f = File.new fn, 'r'
      @f.set_encoding encoding
    end
  end

  # Without a block: returns the next line (nil at EOF).
  # With a block: yields every remaining line.
  # FIX(review): IO#gets silently ignores blocks, so the original
  # '@f.gets { |line| yield line }' never yielded.
  def gets
    if block_given?
      while line = @f.gets
        yield line
      end
    else
      @f.gets
    end
  end

  def readlines
    @f.readlines
  end

  def readlines_strip
    self.readlines.map{ |i| i.strip }
  end

  def read
    @f.read
  end

  def close
    @f.close if @f!=STDIN
  end
end

# Writer for plain files, gzipped ('.gz') files or STDOUT ('-').
class WriteFile

  def initialize fn, encoding='utf-8'
    if fn.split('.').last == 'gz'
      # FIX(review): the class is Zlib::GzipWriter (was 'GzipWrite')
      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
    elsif fn == '-'
      @f = STDOUT
      STDOUT.set_encoding encoding
    else
      @f = File.new fn, 'w+'
      @f.set_encoding encoding
    end
  end

  def write s
    @f.write s
  end

  # FIX(review): compared against STDIN; a '-' WriteFile wraps
  # STDOUT and must not close it.
  def close
    @f.close if @f!=STDOUT
  end
end


# --- lib/nlp_ruby/semirings.rb -------------------------------
# semirings for graphs as described in
# 'Dynamic Programming Algorithms in
#  Semiring and Hypergraph Frameworks' (Liang Huang)
class Semiring
  attr_accessor :add, :multiply, :one, :null, :convert
end

class BooleanSemiring < Semiring
  def initialize
    @add      = Proc.new { |a,b| a||b }
    @multiply = Proc.new { |a,b| a&&b }
    @one      = true
    @null     = false
    @convert  = Proc.new { |v| v!=0 }
  end
end

class ViterbiSemiring < Semiring
  def initialize
    @add      = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a*b }
    @one      = 1.0
    @null     = 0.0
    @convert  = Proc.new { |v| v }
  end
end

class InsideSemiring < Semiring
  def initialize
    @add      = Proc.new { |a,b| a+b }
    @multiply = Proc.new { |a,b| a*b }
    @one      = 1.0
    @null     = 0.0
    @convert  = Proc.new { |v| v }
  end
end

# min-plus (tropical) semiring for shortest paths
class RealSemiring < Semiring
  def initialize
    @add      = Proc.new { |a,b| [a,b].min }
    @multiply = Proc.new { |a,b| a+b }
    @one      = 0.0
    @null     = 1.0/0.0
    @convert  = Proc.new { |v| v }
  end
end

# for longest/worst paths
class RealxSemiring < Semiring
  def initialize
    @add      = Proc.new { |a,b| [a,b].max }
    @multiply = Proc.new { |a,b| a+b }
    @one      = -1.0/0.0
    @null     = 0.0
    @convert  = Proc.new { |v| v }
  end
end

class CountingSemiring < Semiring
  def initialize
    @add      = Proc.new { |a,b| a+b }
    @multiply = Proc.new { |a,b| a*b }
    @one      = 1.0
    @null     = 0.0
    @convert  = Proc.new { |v| if v!=0 then 1 else 0 end }
  end
end


# --- lib/nlp_ruby/stringutil.rb ------------------------------
# whitespace 'tokenizer'
def tokenize s
  s.strip.split
end

# split a line on ' ||| ' separators (moses/cdec convention)
def splitpipe s
  s.strip.split(/\s*\|\|\|\s*/)
end

# truthy iff s contains a lowercase letter
def downcase? s
  s[/[[:lower:]]/]
end

# iterator over n-grams; yields all m-grams for m<=n unless
# fix==true, in which case only exact n-grams are yielded
def ngrams(s, n, fix=false)
  a = tokenize s
  a.each_with_index { |tok, i|
    0.upto([n-1, a.size-i-1].min) { |m|
      yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
    }
  }
end

# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
def read_feature_string s
  map = SparseVector.new
  tokenize(s).each { |i|
    key, value = i.split '='
    map[key] = value.to_f
  }
  return map
end


# --- lib/nlp_ruby/tfidf.rb -----------------------------------
# returns key='raw frequency' for an array-like object,
# ignoring the given stopwords
def tf array, stopwords=[]
  v = {}; v.default = 0
  array.uniq.each { |i|
    next if stopwords.include? i
    v[i] = array.count(i).to_f
  }
  return v
end

# smoothes raw frequencies of tf() in-place
# a is a smoothing term
def ntf hash, a=0.4
  max = hash.values.max.to_f
  hash.each_pair { |k,v|
    hash[k] = a + (1-a)*(v/max)
  }
end

# returns idf value for each word in a vocabulary;
# accepts an Array of tf-hashes or a Hash of doc => tf-hash.
# FIX(review): the original called Hash#values on an Array and,
# since Array#flatten does not expand hashes, counted hash
# objects instead of words.
def idf list_of_hashes
  docs = list_of_hashes.is_a?(Hash) ? list_of_hashes.values : list_of_hashes
  keysets = docs.map { |h| h.keys }
  vocab = keysets.flatten.uniq
  n = docs.size.to_f
  idf = {}
  vocab.each { |w|
    df = keysets.count { |keys| keys.include? w } # document frequency
    idf[w] = Math.log(n/df)
  }
  return idf
end


# --- lib/nlp_ruby/ttable.rb ----------------------------------
# table['some French string'] = [Array of [English string, features]]
def read_phrase_table fn
  table = {}
  f = ReadFile.new fn
  while raw_rule = f.gets
    french, english, features = splitpipe(raw_rule)
    feature_map = read_feature_string(features)
    if table.has_key? french
      table[french] << [english, feature_map]
    else
      table[french] = [[english, feature_map]]
    end
  end
  f.close
  return table
end

# (lib/rubynlp.rb, the empty old entry point, was deleted by
# this commit.)