summary refs log tree commit diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-02-14 18:56:47 +0100
committer Patrick Simianer <p@simianer.de> 2014-02-14 18:56:47 +0100
commit 062eda911830c779aa685885b8e15ecceabfc085 (patch)
tree 18f02b6e327d211dd717173161024203d9630f57
parent 0347cbe4157bb4721e58342243272e0515d286ba (diff)
some class methods; Translation scores dict; DAG edges->outgoing; TFIDF module
-rw-r--r-- lib/nlp_ruby/SparseVector.rb 57
-rw-r--r-- lib/nlp_ruby/Translation.rb 10
-rw-r--r-- lib/nlp_ruby/dag.rb 24
-rw-r--r-- lib/nlp_ruby/misc.rb 14
-rw-r--r-- lib/nlp_ruby/tfidf.rb 12
5 files changed, 76 insertions, 41 deletions
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index b80373c..3096412 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -12,14 +12,32 @@ class SparseVector < Hash
a.each_with_index { |i,j| self[j] = i }
end
+ def self.from_a a
+ v = SparseVector.new
+ v.from_a a
+ return v
+ end
+
def from_h h
h.each_pair { |k,v| self[k] = v }
end
+ def self.from_h h
+ v = SparseVector.new
+ v.from_h h
+ return v
+ end
+
def from_s s
from_h eval(s)
end
+ def self.from_s s
+ v = SparseVector.new
+ v.from_s s
+ return v
+ end
+
def to_kv sep='=', join=' '
a = []
self.each_pair { |k,v|
@@ -35,6 +53,12 @@ class SparseVector < Hash
}
end
+ def self.from_kv s
+ v = SparseVector.new
+ v.from_kv s
+ return v
+ end
+
def from_file fn, sep='='
f = ReadFile.new(fn)
while line = f.gets
@@ -44,6 +68,12 @@ class SparseVector < Hash
end
end
+ def self.from_file fn, sep='='
+ v = SparseVector.new
+ v.from_file fn, sep
+ return v
+ end
+
def join_keys other
self.keys + other.keys
end
@@ -126,24 +156,17 @@ class SparseVector < Hash
}
return new
end
-end
-
-
-module SparseVector
-
-def SparseVector::mean a
- mean = SparseVector.new
- a.each { |i|
- i.each_pair { |k,v|
- mean[k] += v
+ def self.mean a
+ mean = SparseVector.new
+ a.each { |i|
+ i.each_pair { |k,v|
+ mean[k] += v
+ }
}
- }
- n = array_of_vectors.size.to_f
- mean.each_pair { |k,v| mean[k] = v/n }
- return mean
-end
-
-
+ n = a.size.to_f
+ mean.each_pair { |k,v| mean[k] = v/n }
+ return mean
+ end
end
diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb
index 0c346a4..34effe0 100644
--- a/lib/nlp_ruby/Translation.rb
+++ b/lib/nlp_ruby/Translation.rb
@@ -1,5 +1,5 @@
class Translation
- attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
+ attr_accessor :id, :s, :raw, :f, :scores, :rank
def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
@id = id
@@ -21,11 +21,17 @@ class Translation
@s = raw
end
@id = id.to_i
- @f = read_feature_string features
+ @f = SparseVector.from_kv features
@scores['decoder'] = score.to_f
@rank = rank
end
+ def self.from_s s
+ t = self.new
+ t.from_s s
+ return t
+ end
+
def to_s include_features=true
[@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features
[@id, @s, @scores['decoder']].join(' ||| ') if !include_features
diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb
index cca35c5..6f514c7 100644
--- a/lib/nlp_ruby/dag.rb
+++ b/lib/nlp_ruby/dag.rb
@@ -4,27 +4,27 @@ require 'json'
class DAG::Node
- attr_accessor :label, :edges, :incoming, :score, :mark
+ attr_accessor :label, :outgoing, :incoming, :score, :mark
- def initialize label=nil, edges=[], incoming=[], score=nil
+ def initialize label=nil, outgoing=[], incoming=[], score=nil
@label = label
- @edges = edges # outgoing
+ @outgoing = outgoing
@incoming = incoming
@score = nil
end
def add_edge head, weight=0
exit if self==head # no self-cycles!
- @edges << DAG::Edge.new(self, head, weight)
- return @edges.last
+ @outgoing << DAG::Edge.new(self, head, weight)
+ return @outgoing.last
end
def to_s
- "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>"
+ "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
end
def repr
- "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
+ "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
end
end
@@ -50,7 +50,7 @@ end
# w/o markings as we do not have cycles
def DAG::dfs n, target_label
return n if n.label==target_label # assumes uniq labels!
- stack = n.edges.map { |i| i.head }
+ stack = n.outgoing.map { |i| i.head }
while !stack.empty?
m = stack.pop
return DAG::dfs m, target_label
@@ -65,7 +65,7 @@ def DAG::bfs n, target_label
while !queue.empty?
m = queue.shift
return m if m.label==target_label
- m.edges.each { |e| queue << e.head }
+ m.outgoing.each { |e| queue << e.head }
end
return nil
end
@@ -76,7 +76,7 @@ def DAG::topological_sort graph
s = graph.reject { |n| !n.incoming.empty? }
while !s.empty?
sorted << s.shift
- sorted.last.edges.each { |e|
+ sorted.last.outgoing.each { |e|
e.mark = true
s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
}
@@ -110,7 +110,7 @@ def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
toposorted = DAG::topological_sort(graph)
DAG::init(graph, semiring, source_node)
toposorted.each { |n|
- n.edges.each { |e|
+ n.outgoing.each { |e|
e.head.score = \
semiring.add.call(e.head.score, \
semiring.multiply.call(n.score, e.weight)
@@ -127,7 +127,7 @@ def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
q = PriorityQueue.new graph
while !q.empty?
n = q.pop
- n.edges.each { |e|
+ n.outgoing.each { |e|
e.head.score = \
semiring.add.call(e.head.score, \
semiring.multiply.call(n.score, e.weight))
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 0f58100..b2ab885 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -1,3 +1,6 @@
+require 'timeout'
+
+
class Array
def max_index
self.index(self.max)
@@ -23,8 +26,8 @@ end
class String
- def downcase? s
- s[/[[:lower:]]/]
+ def downcase?
+ self[/[[:lower:]]/]
end
end
@@ -56,16 +59,13 @@ class PriorityQueue
end
def spawn_with_timeout cmd, t=4, debug=false
- require 'timeout'
STDERR.write cmd+"\n" if debug
pipe_in, pipe_out = IO.pipe
pid = Process.spawn(cmd, :out => pipe_out)
begin
Timeout.timeout(t) { Process.wait pid }
rescue Timeout::Error
- return ""
- # accept the zombies
- #Process.kill('TERM', pid)
+ Process.kill('TERM', pid)
end
pipe_out.close
return pipe_in.read
@@ -76,7 +76,7 @@ def read_phrase_table fn
f = ReadFile.new fn
while raw_rule = f.gets
french, english, features = splitpipe(raw_rule)
- feature_map = read_feature_string(features)
+ feature_map = SparseVector.from_kv features
if table.has_key? french
table[french] << [english, feature_map ]
else
diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb
index 84d55a5..13a40a3 100644
--- a/lib/nlp_ruby/tfidf.rb
+++ b/lib/nlp_ruby/tfidf.rb
@@ -1,6 +1,9 @@
+module TFIDF
+
+
# returns key='raw frequency' for an
# array-like object
-def tf array, stopwords=[]
+def TFIDF::tf array, stopwords=[]
v = {}; v.default = 0
array.uniq.each { |i|
next if stopwords.include? i
@@ -11,7 +14,7 @@ end
# smoothes raw frequencies of tf() in-place
# a is a smoothing term
-def ntf hash, a=0.4
+def TFIDF::ntf hash, a=0.4
max = hash.values.max.to_f
hash.each_pair { |k,v|
hash[k] = a + (1-a)*(v/max)
@@ -19,7 +22,7 @@ def ntf hash, a=0.4
end
# returns idf value for each word in a vocabulary
-def idf list_of_hashes
+def TFIDF::idf list_of_hashes
vocab = list_of_hashes.values.flatten.uniq
n = list_of_hashes.size.to_f
idf = {}
@@ -30,3 +33,6 @@ def idf list_of_hashes
return idf
end
+
+end #module
+