Diffstat (limited to 'lib')
-rwxr-xr-x  lib/nlp_ruby.rb                                                     6
-rw-r--r--  lib/nlp_ruby/PriorityQueue.rb                                      37
-rw-r--r--  lib/nlp_ruby/SparseVector.rb                                       71
-rw-r--r--  lib/nlp_ruby/Translation.rb (renamed from lib/nlp_ruby/ttable.rb)  33
-rw-r--r--  lib/nlp_ruby/bleu.rb                                               12
-rw-r--r--  lib/nlp_ruby/cdec.rb                                               20
-rw-r--r--  lib/nlp_ruby/dag.rb (renamed from lib/nlp_ruby/dags.rb)            15
-rw-r--r--  lib/nlp_ruby/misc.rb                                               74
-rw-r--r--  lib/nlp_ruby/semirings.rb                                           3
-rw-r--r--  lib/nlp_ruby/stringutil.rb                                         41
10 files changed, 133 insertions, 179 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
index c7af97b..a3f9f1a 100755
--- a/lib/nlp_ruby.rb
+++ b/lib/nlp_ruby.rb
@@ -3,14 +3,12 @@
 require 'nlp_ruby/stringutil'
 require 'nlp_ruby/fileutil'
 require 'nlp_ruby/SparseVector'
-require 'nlp_ruby/PriorityQueue'
 require 'nlp_ruby/tfidf'
-require 'nlp_ruby/ttable'
-require 'nlp_ruby/dags'
+require 'nlp_ruby/Translation'
+require 'nlp_ruby/dag'
 require 'nlp_ruby/semirings'
 require 'nlp_ruby/bleu'
 require 'nlp_ruby/misc'
-require 'nlp_ruby/cdec'
 
 STDIN.set_encoding 'utf-8'
 STDOUT.set_encoding 'utf-8'
diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
deleted file mode 100644
index f090e60..0000000
--- a/lib/nlp_ruby/PriorityQueue.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# FIXME dags
-# this assumes that elements in the queue
-# have a numerical member named 'score'
-class PriorityQueue
-
-  def initialize a=Array.new
-    @queue = Array.new a
-    sort!
-  end
-
-  def sort!
-    @queue.sort_by! { |i| -i.score }
-  end
-
-  def pop
-    @queue.pop
-  end
-
-  def push i
-    @queue << i
-    sort!
-  end
-
-  def empty?
-    @queue.empty?
-  end
-
-  # FIXME
-  def to_s
-    a = []
-    @queue.each { |i|
-      a << "#{i.to_s}[#{i.score}]"
-    }
-    "[#{a.join ', '}]"
-  end
-end
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 1c0262b..b80373c 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -20,6 +20,34 @@ class SparseVector < Hash
     from_h eval(s)
   end
 
+  def to_kv sep='=', join=' '
+    a = []
+    self.each_pair { |k,v|
+      a << "#{k}#{sep}#{v}"
+    }
+    return a.join join
+  end
+
+  def from_kv s
+    s.split.each { |i|
+      k,v = i.split('=')
+      self[k] = v.to_f
+    }
+  end
+
+  def from_file fn, sep='='
+    f = ReadFile.new(fn)
+    while line = f.gets
+      key, value = line.strip.split sep
+      value = value.to_f
+      self[key] = value
+    end
+  end
+
+  def join_keys other
+    self.keys + other.keys
+  end
+
   def sum
     self.values.inject(:+)
   end
@@ -74,38 +102,6 @@ class SparseVector < Hash
     return Math.sqrt(sum)
   end
 
-  # FIXME
-  def from_kv_file fn, sep=' '
-    f = ReadFile.new(fn)
-    while line = f.gets
-      key, value = line.strip.split sep
-      value = value.to_f
-      self[key] = value
-    end
-  end
-
-  # FIXME
-  def to_kv sep='='
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join ' '
-  end
-
-  # FIXME
-  def to_kv2 sep='='
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join "\n"
-  end
-
-  def join_keys other
-    self.keys + other.keys
-  end
-
   def + other
     new = SparseVector.new
     join_keys(other).each { |k|
@@ -132,9 +128,13 @@ class SparseVector < Hash
   end
 end
 
-def mean_sparse_vector array_of_vectors
+
+
+module SparseVector
+
+def SparseVector::mean a
   mean = SparseVector.new
-  array_of_vectors.each { |i|
+  a.each { |i|
     i.each_pair { |k,v|
       mean[k] += v
     }
@@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors
   return mean
 end
 
+
+end
+
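The SparseVector change above folds the three FIXME'd serializers (to_kv, to_kv2, from_kv_file) into a single to_kv/from_kv pair with configurable separators. A minimal usage sketch (assuming the library is loaded via require 'nlp_ruby'; the feature string is made up for illustration):

  require 'nlp_ruby'

  v = SparseVector.new
  v.from_kv 'a=1.0 b=2.5'   # parse whitespace-separated key=value pairs
  puts v.to_kv('=', ' ')    # serialize back: "a=1.0 b=2.5"
  puts v.sum                # 3.5
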
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/Translation.rb
index c0f37be..0c346a4 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/Translation.rb
@@ -1,32 +1,13 @@
-# table['some French string'] = [Array of English strings]
-def read_phrase_table fn
-  table = {}
-  f = ReadFile.new fn
-  while raw_rule = f.gets
-    french, english, features = splitpipe(raw_rule)
-    feature_map = read_feature_string(features)
-    if table.has_key? french
-      table[french] << [english, feature_map ]
-    else
-      table[french] = [[english, feature_map]]
-    end
-  end
-  f.close
-  return table
-end
-
-# FIXME
 class Translation
   attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
 
-  def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
+  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
     @id = id
     @raw = raw
     @s = s
     @f = f
-    @score = score
+    @scores = scores
     @rank = rank
-    @other_score = other_score
   end
 
   def from_s t, strip_alignment=true, rank=nil
@@ -41,17 +22,17 @@ class Translation
     end
     @id = id.to_i
     @f = read_feature_string features
-    @score = score.to_f
+    @scores['decoder'] = score.to_f
     @rank = rank
-    @other_score = nil
   end
 
-  def to_s
-    [@id, @s, @f.to_kv, @score].join ' ||| '
+  def to_s include_features=true
+    [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features
+    [@id, @s, @scores['decoder']].join(' ||| ') if !include_features
   end
 
   def to_s2
-    [@rank, @s, @score, @other_score].join ' ||| '
+    [@rank, @s, @score, @scores.to_s].join ' ||| '
   end
 end
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index ee91985..d7a6b2b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1
   return p
 end
 
-def BLEU::brevity_penalty c, r
+def BLEU::brevity_penalty c, r, hack=0.0
   return 1.0 if c>r
-  return Math.exp(1-r/c)
+  return Math.exp 1.0-((r+hack)/c)
 end
 
 def BLEU::bleu counts, n, debug=false
@@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false
   (100*bleu(counts, n, debug)).round(3)
 end
 
-def BLEU::per_sentence_bleu hypothesis, reference, n=4
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
   h_ng = {}; r_ng = {}
   (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
   ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
@@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4
   (1).upto(m) { |i|
     counts_clipped = 0
     counts_sum = h_ng[i].size
-    h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
+    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
     add = 1.0 if i >= 2
     sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
-  } 
+  }
   return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
 end
 
-end
+end # module
diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb
deleted file mode 100644
index 1080f14..0000000
--- a/lib/nlp_ruby/cdec.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-module CDEC
-
-require 'open3'
-
-
-# FIXME
-CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec"
-
-
-def CDEC::kbest input, ini, weights, k, unique=true
-  o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r  2>/dev/null"
-  j = -1
-  ret = []
-  o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t}
-  return ret
-end
-
-
-end
-
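In the bleu.rb hunk above, brevity_penalty and per_sentence_bleu gain an optional hack parameter (added to the reference length inside brevity_penalty), and Translation now keeps the decoder score in a scores hash. A quick sentence-level sketch against the module functions shown above (the example sentences are made up):

  hyp = 'ein haus ist gross'
  ref = 'das haus ist gross'
  puts BLEU::per_sentence_bleu(hyp, ref, 4)   # add-one smoothed for n-gram orders > 1
  puts BLEU::brevity_penalty(4, 5, 0.5)       # penalty with the reference length padded by 0.5
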
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dag.rb
index 7767be1..cca35c5 100644
--- a/lib/nlp_ruby/dags.rb
+++ b/lib/nlp_ruby/dag.rb
@@ -1,21 +1,8 @@
-###########################
-# TODO
-# output paths
-# visualization?
-# algorithms:
-#  beam search
-#  best-first
-#  kbest
-#  kruskal (MST)?
-#  transitive closure?
-###########################
+module DAG
 
 require 'json'
 
-module DAG
-
-
 class DAG::Node
   attr_accessor :label, :edges, :incoming, :score, :mark
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 80d932c..0f58100 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -21,6 +21,40 @@ class Array
   end
 end
 
+class String
+
+  def downcase? s
+    s[/[[:lower:]]/]
+  end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+  def initialize a=Array.new
+    @queue = Array.new a
+    sort!
+  end
+
+  def sort!
+    @queue.sort_by! { |i| -i.score }
+  end
+
+  def pop
+    @queue.pop
+  end
+
+  def push i
+    @queue << i
+    sort!
+  end
+
+  def empty?
+    @queue.empty?
+  end
+end
+
 def spawn_with_timeout cmd, t=4, debug=false
   require 'timeout'
   STDERR.write cmd+"\n" if debug
@@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false
   return pipe_in.read
 end
 
+def read_phrase_table fn
+  table = {}
+  f = ReadFile.new fn
+  while raw_rule = f.gets
+    french, english, features = splitpipe(raw_rule)
+    feature_map = read_feature_string(features)
+    if table.has_key? french
+      table[french] << [english, feature_map ]
+    else
+      table[french] = [[english, feature_map]]
+    end
+  end
+  f.close
+  return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+  require 'open3'
+  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+  cmd += " -r" if unique
+  o,_ = Open3.capture2 "#{cmd}  2>/dev/null"
+  a = []; j = -1
+  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+  return a
+end
+
+def read_config fn
+  f = ReadFile.new fn
+  cfg = {}
+  while line = f.gets
+    line.strip!
+    next if /^\s*$/.match line
+    next if line[0]=='#'
+    content = line.split('#', 2).first
+    k, v = content.split(/\s*=\s*/, 2)
+    k.strip!; v.strip!
+    cfg[k] = v
+  end
+  return cfg
+end
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
index a06f151..83551a9 100644
--- a/lib/nlp_ruby/semirings.rb
+++ b/lib/nlp_ruby/semirings.rb
@@ -1,4 +1,5 @@
-# semirings for graphs as described in
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
 # 'Dynamic Programming Algorithms in
 #  Semiring and Hypergraph Frameworks' (Liang Huang)
 class Semiring
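PriorityQueue moves from its own file into misc.rb above; elements only need a numeric score member, and since the queue is kept sorted by descending score, pop returns the element with the smallest score. A small sketch (Item is a made-up struct for illustration):

  Item = Struct.new(:label, :score)

  q = PriorityQueue.new [Item.new('b', 2.0), Item.new('c', 1.0)]
  q.push Item.new('a', 0.5)
  puts q.pop.label until q.empty?   # prints a, c, b
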
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
 def tokenize s
   s.strip.split
 end
 
-def splitpipe s, n=3
-  s.strip.split("|"*n)
-end
-
-def downcase? s
-  s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
 def ngrams(s, n, fix=false)
   a = tokenize s
   a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
   }
 end
 
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
-  map = SparseVector.new
-  tokenize(s).each { |i|
-    key, value = i.split '='
-    map[key] = value.to_f
-  }
-  return map
-end
-
-
-def read_cfg fn
-  f = ReadFile.new fn
-  cfg = {}
-  while line = f.gets
-    line.strip!
-    next if /^\s*$/.match line
-    next if line[0]=='#'
-    content = line.split('#', 2).first
-    k, v = content.split(/\s*=\s*/, 2)
-    k.strip!; v.strip!
-    cfg[k] = v
-  end
-  return cfg
-end
-
 def bag_of_words s, stopwords=[]
   s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end 
+end
+def splitpipe s, n=3
+  s.strip.split("|"*n)
+end
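In stringutil.rb, splitpipe is kept (moved to the bottom of the file), while downcase?, read_feature_string, and read_cfg leave the file (read_cfg resurfaces as read_config in misc.rb, downcase? as a String method). A sketch of the helpers that remain (the rule string is made up):

  french, english, features = splitpipe('das haus ||| the house ||| logp=-0.4')
  puts english.strip                                             # "the house"
  puts bag_of_words('the house is the house', ['the']).inspect   # ["house", "is"]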
