diff options
| -rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 57 | ||||
| -rw-r--r-- | lib/nlp_ruby/Translation.rb | 10 | ||||
| -rw-r--r-- | lib/nlp_ruby/dag.rb | 24 | ||||
| -rw-r--r-- | lib/nlp_ruby/misc.rb | 14 | ||||
| -rw-r--r-- | lib/nlp_ruby/tfidf.rb | 12 | 
5 files changed, 76 insertions, 41 deletions
| diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index b80373c..3096412 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -12,14 +12,32 @@ class SparseVector < Hash      a.each_with_index { |i,j| self[j] = i }    end +  def self.from_a a +    v = SparseVector.new +    v.from_a a +    return v +  end +    def from_h h      h.each_pair { |k,v| self[k] = v }    end +  def self.from_h h +    v = SparseVector.new +    v.from_h h +    return v +  end +    def from_s s      from_h eval(s)    end +  def self.from_s s +    v = SparseVector.new +    v.from_s s +    return v +  end +    def to_kv sep='=', join=' '      a = []      self.each_pair { |k,v| @@ -35,6 +53,12 @@ class SparseVector < Hash      }    end +  def self.from_kv s +    v = SparseVector.new +    v.from_kv s +    return v +  end +    def from_file fn, sep='='      f = ReadFile.new(fn)      while line = f.gets @@ -44,6 +68,12 @@ class SparseVector < Hash      end    end +  def self.from_file fn, sep='=' +    v = SparseVector.new +    v.from_file fn, sep +    return v +  end +    def join_keys other      self.keys + other.keys    end @@ -126,24 +156,17 @@ class SparseVector < Hash      }      return new    end -end - - -module SparseVector - -def SparseVector::mean a -  mean = SparseVector.new -  a.each { |i| -    i.each_pair { |k,v| -      mean[k] += v +  def self.mean a +    mean = SparseVector.new +    a.each { |i| +      i.each_pair { |k,v| +        mean[k] += v +      }      } -  } -  n = array_of_vectors.size.to_f -  mean.each_pair { |k,v| mean[k] = v/n } -  return mean -end - - +    n = a.size.to_f +    mean.each_pair { |k,v| mean[k] = v/n } +    return mean +  end  end diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb index 0c346a4..34effe0 100644 --- a/lib/nlp_ruby/Translation.rb +++ b/lib/nlp_ruby/Translation.rb @@ -1,5 +1,5 @@  class Translation -  attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score +  attr_accessor :id, :s, :raw, :f, :scores, :rank    def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil      @id = id @@ -21,11 +21,17 @@ class Translation        @s = raw      end      @id = id.to_i -    @f = read_feature_string features +    @f = SparseVector.from_kv features      @scores['decoder'] = score.to_f      @rank = rank    end +  def self.from_s s +    t = self.new +    t.from_s s +    return t +  end +    def to_s include_features=true      [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features      [@id, @s, @scores['decoder']].join(' ||| ') if !include_features diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb index cca35c5..6f514c7 100644 --- a/lib/nlp_ruby/dag.rb +++ b/lib/nlp_ruby/dag.rb @@ -4,27 +4,27 @@ require 'json'  class DAG::Node -  attr_accessor :label, :edges, :incoming, :score, :mark +  attr_accessor :label, :outgoing, :incoming, :score, :mark -  def initialize label=nil, edges=[], incoming=[], score=nil +  def initialize label=nil, outgoing=[], incoming=[], score=nil      @label    = label -    @edges    = edges # outgoing +    @outgoing = outgoing      @incoming = incoming      @score    = nil    end    def add_edge head, weight=0      exit if self==head # no self-cycles! -    @edges << DAG::Edge.new(self, head, weight) -    return @edges.last +    @outgoing << DAG::Edge.new(self, head, weight) +    return @outgoing.last    end    def to_s -    "DAG::Node<label:#{label}, edges:#{edges.size}, incoming:#{incoming.size}>" +    "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"    end    def repr -    "#{to_s} #{@score} out:#{@edges} in:[#{@incoming.map{|e| e.to_s}.join ', '}]" +    "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"    end  end @@ -50,7 +50,7 @@ end  #  w/o markings as we do not have cycles  def DAG::dfs n, target_label    return n if n.label==target_label # assumes uniq labels! -  stack = n.edges.map { |i| i.head } +  stack = n.outgoing.map { |i| i.head }    while !stack.empty?      m = stack.pop      return DAG::dfs m, target_label @@ -65,7 +65,7 @@ def DAG::bfs n, target_label    while !queue.empty?      m = queue.shift      return m if m.label==target_label -    m.edges.each { |e| queue << e.head } +    m.outgoing.each { |e| queue << e.head }    end    return nil  end @@ -76,7 +76,7 @@ def DAG::topological_sort graph    s = graph.reject { |n| !n.incoming.empty? }    while !s.empty?      sorted << s.shift -    sorted.last.edges.each { |e| +    sorted.last.outgoing.each { |e|        e.mark = true        s << e.head if e.head.incoming.reject{|f| f.mark}.empty?      } @@ -110,7 +110,7 @@ def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node    toposorted = DAG::topological_sort(graph)    DAG::init(graph, semiring, source_node)    toposorted.each { |n| -    n.edges.each { |e| +    n.outgoing.each { |e|        e.head.score = \          semiring.add.call(e.head.score, \                            semiring.multiply.call(n.score, e.weight) @@ -127,7 +127,7 @@ def DAG::dijkstra graph, semiring=RealSemiring.new, source_node    q = PriorityQueue.new graph    while !q.empty?      n = q.pop -    n.edges.each { |e| +    n.outgoing.each { |e|        e.head.score = \          semiring.add.call(e.head.score, \                            semiring.multiply.call(n.score, e.weight)) diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb index 0f58100..b2ab885 100644 --- a/lib/nlp_ruby/misc.rb +++ b/lib/nlp_ruby/misc.rb @@ -1,3 +1,6 @@ +require 'timeout' + +  class Array    def max_index      self.index(self.max) @@ -23,8 +26,8 @@ end  class String -  def downcase? s -    s[/[[:lower:]]/] +  def downcase? +    self[/[[:lower:]]/]    end  end @@ -56,16 +59,13 @@ class PriorityQueue  end  def spawn_with_timeout cmd, t=4, debug=false -  require 'timeout'    STDERR.write cmd+"\n" if debug    pipe_in, pipe_out = IO.pipe    pid = Process.spawn(cmd, :out => pipe_out)    begin      Timeout.timeout(t) { Process.wait pid }    rescue Timeout::Error -    return "" -    # accept the zombies -    #Process.kill('TERM', pid) +    Process.kill('TERM', pid)    end    pipe_out.close    return pipe_in.read @@ -76,7 +76,7 @@ def read_phrase_table fn    f = ReadFile.new fn    while raw_rule = f.gets      french, english, features = splitpipe(raw_rule) -    feature_map = read_feature_string(features) +    feature_map = SparseVector.from_kv  features      if table.has_key? french        table[french] << [english, feature_map ]      else diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb index 84d55a5..13a40a3 100644 --- a/lib/nlp_ruby/tfidf.rb +++ b/lib/nlp_ruby/tfidf.rb @@ -1,6 +1,9 @@ +module TFIDF + +  # returns key='raw frequency' for an  # array-like object -def tf array, stopwords=[] +def TFIDF::tf array, stopwords=[]    v = {}; v.default = 0    array.uniq.each { |i|     next if stopwords.include? i @@ -11,7 +14,7 @@ end  # smoothes raw frequencies of tf() in-place  # a is a smoothing term -def ntf hash, a=0.4 +def TFIDF::ntf hash, a=0.4    max = hash.values.max.to_f    hash.each_pair { |k,v|      hash[k] = a + (1-a)*(v/max) @@ -19,7 +22,7 @@ def ntf hash, a=0.4  end  # returns idf value for each word in a vocabulary -def idf list_of_hashes +def TFIDF::idf list_of_hashes    vocab = list_of_hashes.values.flatten.uniq    n = list_of_hashes.size.to_f    idf = {} @@ -30,3 +33,6 @@ def idf list_of_hashes    return idf  end + +end #module + | 
