diff options
-rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 57 | ||||
-rw-r--r-- | lib/nlp_ruby/Translation.rb | 10 | ||||
-rw-r--r-- | lib/nlp_ruby/dag.rb | 24 | ||||
-rw-r--r-- | lib/nlp_ruby/misc.rb | 14 | ||||
-rw-r--r-- | lib/nlp_ruby/tfidf.rb | 12 |
5 files changed, 76 insertions, 41 deletions
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index b80373c..3096412 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -12,14 +12,32 @@ class SparseVector < Hash a.each_with_index { |i,j| self[j] = i } end + def self.from_a a + v = SparseVector.new + v.from_a a + return v + end + def from_h h h.each_pair { |k,v| self[k] = v } end + def self.from_h h + v = SparseVector.new + v.from_h h + return v + end + def from_s s from_h eval(s) end + def self.from_s s + v = SparseVector.new + v.from_s s + return v + end + def to_kv sep='=', join=' ' a = [] self.each_pair { |k,v| @@ -35,6 +53,12 @@ class SparseVector < Hash } end + def self.from_kv s + v = SparseVector.new + v.from_kv s + return v + end + def from_file fn, sep='=' f = ReadFile.new(fn) while line = f.gets @@ -44,6 +68,12 @@ class SparseVector < Hash end end + def self.from_file fn, sep='=' + v = SparseVector.new + v.from_file fn, sep + return v + end + def join_keys other self.keys + other.keys end @@ -126,24 +156,17 @@ class SparseVector < Hash } return new end -end - - -module SparseVector - -def SparseVector::mean a - mean = SparseVector.new - a.each { |i| - i.each_pair { |k,v| - mean[k] += v + def self.mean a + mean = SparseVector.new + a.each { |i| + i.each_pair { |k,v| + mean[k] += v + } } - } - n = array_of_vectors.size.to_f - mean.each_pair { |k,v| mean[k] = v/n } - return mean -end - - + n = a.size.to_f + mean.each_pair { |k,v| mean[k] = v/n } + return mean + end end diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb index 0c346a4..34effe0 100644 --- a/lib/nlp_ruby/Translation.rb +++ b/lib/nlp_ruby/Translation.rb @@ -1,5 +1,5 @@ class Translation - attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score + attr_accessor :id, :s, :raw, :f, :scores, :rank def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil @id = id @@ -21,11 +21,17 @@ class Translation @s = raw end @id = id.to_i - @f = read_feature_string features + @f = SparseVector.from_kv features @scores['decoder'] = score.to_f @rank = rank end + def self.from_s s + t = self.new + t.from_s s + return t + end + def to_s include_features=true [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features [@id, @s, @scores['decoder']].join(' ||| ') if !include_features diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb index cca35c5..6f514c7 100644 --- a/lib/nlp_ruby/dag.rb +++ b/lib/nlp_ruby/dag.rb @@ -4,27 +4,27 @@ require 'json' class DAG::Node - attr_accessor :label, :edges, :incoming, :score, :mark + attr_accessor :label, :outgoing, :incoming, :score, :mark - def initialize label=nil, edges=[], incoming=[], score=nil + def initialize label=nil, outgoing=[], incoming=[], score=nil @label = label - @edges = edges # outgoing + @outgoing = outgoing @incoming = incoming @score = nil end def add_edge head, weight=0 exit if self==head # no self-cycles! - @edges << DAG::Edge.new(self, head, weight) - return @edges.last + @outgoing << DAG::Edge.new(self, head, weight) + return @outgoing.last end def to_s - "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>" + "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>" end def repr - "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" + "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" end end @@ -50,7 +50,7 @@ end # w/o markings as we do not have cycles def DAG::dfs n, target_label return n if n.label==target_label # assumes uniq labels! - stack = n.edges.map { |i| i.head } + stack = n.outgoing.map { |i| i.head } while !stack.empty? m = stack.pop return DAG::dfs m, target_label @@ -65,7 +65,7 @@ def DAG::bfs n, target_label while !queue.empty? m = queue.shift return m if m.label==target_label - m.edges.each { |e| queue << e.head } + m.outgoing.each { |e| queue << e.head } end return nil end @@ -76,7 +76,7 @@ def DAG::topological_sort graph s = graph.reject { |n| !n.incoming.empty? } while !s.empty? sorted << s.shift - sorted.last.edges.each { |e| + sorted.last.outgoing.each { |e| e.mark = true s << e.head if e.head.incoming.reject{|f| f.mark}.empty? } @@ -110,7 +110,7 @@ def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node toposorted = DAG::topological_sort(graph) DAG::init(graph, semiring, source_node) toposorted.each { |n| - n.edges.each { |e| + n.outgoing.each { |e| e.head.score = \ semiring.add.call(e.head.score, \ semiring.multiply.call(n.score, e.weight) @@ -127,7 +127,7 @@ def DAG::dijkstra graph, semiring=RealSemiring.new, source_node q = PriorityQueue.new graph while !q.empty? n = q.pop - n.edges.each { |e| + n.outgoing.each { |e| e.head.score = \ semiring.add.call(e.head.score, \ semiring.multiply.call(n.score, e.weight)) diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb index 0f58100..b2ab885 100644 --- a/lib/nlp_ruby/misc.rb +++ b/lib/nlp_ruby/misc.rb @@ -1,3 +1,6 @@ +require 'timeout' + + class Array def max_index self.index(self.max) @@ -23,8 +26,8 @@ end class String - def downcase? s - s[/[[:lower:]]/] + def downcase? + self[/[[:lower:]]/] end end @@ -56,16 +59,13 @@ class PriorityQueue end def spawn_with_timeout cmd, t=4, debug=false - require 'timeout' STDERR.write cmd+"\n" if debug pipe_in, pipe_out = IO.pipe pid = Process.spawn(cmd, :out => pipe_out) begin Timeout.timeout(t) { Process.wait pid } rescue Timeout::Error - return "" - # accept the zombies - #Process.kill('TERM', pid) + Process.kill('TERM', pid) end pipe_out.close return pipe_in.read @@ -76,7 +76,7 @@ def read_phrase_table fn f = ReadFile.new fn while raw_rule = f.gets french, english, features = splitpipe(raw_rule) - feature_map = read_feature_string(features) + feature_map = SparseVector.from_kv features if table.has_key? french table[french] << [english, feature_map ] else diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb index 84d55a5..13a40a3 100644 --- a/lib/nlp_ruby/tfidf.rb +++ b/lib/nlp_ruby/tfidf.rb @@ -1,6 +1,9 @@ +module TFIDF + + # returns key='raw frequency' for an # array-like object -def tf array, stopwords=[] +def TFIDF::tf array, stopwords=[] v = {}; v.default = 0 array.uniq.each { |i| next if stopwords.include? i @@ -11,7 +14,7 @@ end # smoothes raw frequencies of tf() in-place # a is a smoothing term -def ntf hash, a=0.4 +def TFIDF::ntf hash, a=0.4 max = hash.values.max.to_f hash.each_pair { |k,v| hash[k] = a + (1-a)*(v/max) @@ -19,7 +22,7 @@ def ntf hash, a=0.4 end # returns idf value for each word in a vocabulary -def idf list_of_hashes +def TFIDF::idf list_of_hashes vocab = list_of_hashes.values.flatten.uniq n = list_of_hashes.size.to_f idf = {} @@ -30,3 +33,6 @@ def idf list_of_hashes return idf end + +end #module + |