diff options
-rw-r--r-- | LICENSE | 22 | ||||
-rw-r--r-- | README.md | 18 | ||||
-rwxr-xr-x | lib/nlp_ruby.rb | 6 | ||||
-rw-r--r-- | lib/nlp_ruby/PriorityQueue.rb | 37 | ||||
-rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 71 | ||||
-rw-r--r-- | lib/nlp_ruby/Translation.rb (renamed from lib/nlp_ruby/ttable.rb) | 33 | ||||
-rw-r--r-- | lib/nlp_ruby/bleu.rb | 12 | ||||
-rw-r--r-- | lib/nlp_ruby/cdec.rb | 20 | ||||
-rw-r--r-- | lib/nlp_ruby/dag.rb (renamed from lib/nlp_ruby/dags.rb) | 15 | ||||
-rw-r--r-- | lib/nlp_ruby/misc.rb | 74 | ||||
-rw-r--r-- | lib/nlp_ruby/semirings.rb | 3 | ||||
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 41 |
12 files changed, 161 insertions, 191 deletions
@@ -1,7 +1,23 @@ +The MIT License + Copyright (C) 2014 Patrick Simianer <p ät simianer.de> +http://simianer.de + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. @@ -4,13 +4,13 @@ nlp_ruby My little NLP library, supposed to make _my_ work a little easier and less redundant. The .gem can be found here: https://rubygems.org/gems/nlp_ruby -* dags.rb : implementation of a directed acyclic graph and various algorithms -* fileutil.rb : file utilities -* PriorityQueue.rb : a simple priority queue -* semirings.rb : semirings for dags.rb -* SparseVector.rb : sparse vectors for ruby, based on Hash -* stringutil.rb : string utilities -* tfidf.rb : functions to calculate tf/ntf/idf -* ttable.rb : functions to read MT phrase tables -* Vector.rb : vector class based on Array + bleu.rb : BLEU implementation, also per-sentence-BLEU + dag.rb : implementation of a directed acyclic graph and various algorithms + fileutil.rb : file utilities + misc.rb : misc. stuff (e.g. monkey patches for Array and String) + semirings.rb : semirings (used in dags.rb) + SparseVector.rb : sparse vectors for ruby, based on Hash class + stringutil.rb : string utilities + tfidf.rb : functions to calculate tf/ntf/idf + Translation.rb : an object for diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb index c7af97b..a3f9f1a 100755 --- a/lib/nlp_ruby.rb +++ b/lib/nlp_ruby.rb @@ -3,14 +3,12 @@ require 'nlp_ruby/stringutil' require 'nlp_ruby/fileutil' require 'nlp_ruby/SparseVector' -require 'nlp_ruby/PriorityQueue' require 'nlp_ruby/tfidf' -require 'nlp_ruby/ttable' -require 'nlp_ruby/dags' +require 'nlp_ruby/Translation' +require 'nlp_ruby/dag' require 'nlp_ruby/semirings' require 'nlp_ruby/bleu' require 'nlp_ruby/misc' -require 'nlp_ruby/cdec' STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb deleted file mode 100644 index f090e60..0000000 --- a/lib/nlp_ruby/PriorityQueue.rb +++ /dev/null @@ -1,37 +0,0 @@ -# FIXME dags -# this assumes that elements in the queue -# have a numerical member named 'score' -class PriorityQueue - - def initialize a=Array.new - @queue = Array.new a - sort! - end - - def sort! - @queue.sort_by! { |i| -i.score } - end - - def pop - @queue.pop - end - - def push i - @queue << i - sort! - end - - def empty? - @queue.empty? - end - - # FIXME - def to_s - a = [] - @queue.each { |i| - a << "#{i.to_s}[#{i.score}]" - } - "[#{a.join ', '}]" - end -end - diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index 1c0262b..b80373c 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -20,6 +20,34 @@ class SparseVector < Hash from_h eval(s) end + def to_kv sep='=', join=' ' + a = [] + self.each_pair { |k,v| + a << "#{k}#{sep}#{v}" + } + return a.join join + end + + def from_kv s + s.split.each { |i| + k,v = i.split('=') + self[k] = v.to_f + } + end + + def from_file fn, sep='=' + f = ReadFile.new(fn) + while line = f.gets + key, value = line.strip.split sep + value = value.to_f + self[key] = value + end + end + + def join_keys other + self.keys + other.keys + end + def sum self.values.inject(:+) end @@ -74,38 +102,6 @@ class SparseVector < Hash return Math.sqrt(sum) end - # FIXME - def from_kv_file fn, sep=' ' - f = ReadFile.new(fn) - while line = f.gets - key, value = line.strip.split sep - value = value.to_f - self[key] = value - end - end - - # FIXME - def to_kv sep='=' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join ' ' - end - - # FIXME - def to_kv2 sep='=' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join "\n" - end - - def join_keys other - self.keys + other.keys - end - def + other new = SparseVector.new join_keys(other).each { |k| @@ -132,9 +128,13 @@ class SparseVector < Hash end end -def mean_sparse_vector array_of_vectors + +module SparseVector + + +def SparseVector::mean a mean = SparseVector.new - array_of_vectors.each { |i| + a.each { |i| i.each_pair { |k,v| mean[k] += v } @@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors return mean end + +end + diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/Translation.rb index c0f37be..0c346a4 100644 --- a/lib/nlp_ruby/ttable.rb +++ b/lib/nlp_ruby/Translation.rb @@ -1,32 +1,13 @@ -# table['some French string'] = [Array of English strings] -def read_phrase_table fn - table = {} - f = ReadFile.new fn - while raw_rule = f.gets - french, english, features = splitpipe(raw_rule) - feature_map = read_feature_string(features) - if table.has_key? french - table[french] << [english, feature_map ] - else - table[french] = [[english, feature_map]] - end - end - f.close - return table -end - -# FIXME class Translation attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score - def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil + def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil @id = id @raw = raw @s = s @f = f - @score = score + @scores = scores @rank = rank - @other_score = other_score end def from_s t, strip_alignment=true, rank=nil @@ -41,17 +22,17 @@ class Translation end @id = id.to_i @f = read_feature_string features - @score = score.to_f + @scores['decoder'] = score.to_f @rank = rank - @other_score = nil end - def to_s - [@id, @s, @f.to_kv, @score].join ' ||| ' + def to_s include_features=true + [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features + [@id, @s, @scores['decoder']].join(' ||| ') if !include_features end def to_s2 - [@rank, @s, @score, @other_score].join ' ||| ' + [@rank, @s, @score, @scores.to_s].join ' ||| ' end end diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb index ee91985..d7a6b2b 100644 --- a/lib/nlp_ruby/bleu.rb +++ b/lib/nlp_ruby/bleu.rb @@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1 return p end -def BLEU::brevity_penalty c, r +def BLEU::brevity_penalty c, r, hack=0.0 return 1.0 if c>r - return Math.exp(1-r/c) + return Math.exp 1.0-((r+hack)/c) end def BLEU::bleu counts, n, debug=false @@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false (100*bleu(counts, n, debug)).round(3) end -def BLEU::per_sentence_bleu hypothesis, reference, n=4 +def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0 h_ng = {}; r_ng = {} (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} ngrams(hypothesis, n) {|i| h_ng[i.size] << i} @@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4 (1).upto(m) { |i| counts_clipped = 0 counts_sum = h_ng[i].size - h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} + h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } add = 1.0 if i >= 2 sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); - } + } return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum) end -end +end # module diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb deleted file mode 100644 index 1080f14..0000000 --- a/lib/nlp_ruby/cdec.rb +++ /dev/null @@ -1,20 +0,0 @@ -module CDEC - -require 'open3' - - -# FIXME -CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec" - - -def CDEC::kbest input, ini, weights, k, unique=true - o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r 2>/dev/null" - j = -1 - ret = [] - o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t} - return ret -end - - -end - diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dag.rb index 7767be1..cca35c5 100644 --- a/lib/nlp_ruby/dags.rb +++ b/lib/nlp_ruby/dag.rb @@ -1,21 +1,8 @@ -########################### -# TODO -# output paths -# visualization? -# algorithms: -# beam search -# best-first -# kbest -# kruskal (MST)? -# transitive closure? -########################### +module DAG require 'json' -module DAG - - class DAG::Node attr_accessor :label, :edges, :incoming, :score, :mark diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb index 80d932c..0f58100 100644 --- a/lib/nlp_ruby/misc.rb +++ b/lib/nlp_ruby/misc.rb @@ -21,6 +21,40 @@ class Array end end +class String + + def downcase? s + s[/[[:lower:]]/] + end +end + +class PriorityQueue +# This assumes that elements in the queue +# have a numerical member named 'score'. + + def initialize a=Array.new + @queue = Array.new a + sort! + end + + def sort! + @queue.sort_by! { |i| -i.score } + end + + def pop + @queue.pop + end + + def push i + @queue << i + sort! + end + + def empty? + @queue.empty? + end +end + def spawn_with_timeout cmd, t=4, debug=false require 'timeout' STDERR.write cmd+"\n" if debug @@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false return pipe_in.read end +def read_phrase_table fn + table = {} + f = ReadFile.new fn + while raw_rule = f.gets + french, english, features = splitpipe(raw_rule) + feature_map = read_feature_string(features) + if table.has_key? french + table[french] << [english, feature_map ] + else + table[french] = [[english, feature_map]] + end + end + f.close + return table +end + +def cdec_kbest cdec_bin, input, ini, weights, k, unique=true + require 'open3' + cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}" + cmd += " -r" if unique + o,_ = Open3.capture2 "#{cmd} 2>/dev/null" + a = []; j = -1 + o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t } + return a +end + +def read_config fn + f = ReadFile.new fn + cfg = {} + while line = f.gets + line.strip! + next if /^\s*$/.match line + next if line[0]=='#' + content = line.split('#', 2).first + k, v = content.split(/\s*=\s*/, 2) + k.strip!; v.strip! + cfg[k] = v + end + return cfg +end diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb index a06f151..83551a9 100644 --- a/lib/nlp_ruby/semirings.rb +++ b/lib/nlp_ruby/semirings.rb @@ -1,4 +1,5 @@ -# semirings for graphs as described in +# Semirings for directed acyclic graphs (dags) (also directed hypergraphs), +# as described in: # 'Dynamic Programming Algorithms in # Semiring and Hypergraph Frameworks' (Liang Huang) class Semiring diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb index d7381bb..aa9be00 100644 --- a/lib/nlp_ruby/stringutil.rb +++ b/lib/nlp_ruby/stringutil.rb @@ -1,17 +1,7 @@ -# whitespace 'tokenizer' def tokenize s s.strip.split end -def splitpipe s, n=3 - s.strip.split("|"*n) -end - -def downcase? s - s[/[[:lower:]]/] -end - -# iterator over n-grams def ngrams(s, n, fix=false) a = tokenize s a.each_with_index { |tok, i| @@ -22,34 +12,11 @@ def ngrams(s, n, fix=false) } end -# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 } -def read_feature_string s - map = SparseVector.new - tokenize(s).each { |i| - key, value = i.split '=' - map[key] = value.to_f - } - return map -end - - -def read_cfg fn - f = ReadFile.new fn - cfg = {} - while line = f.gets - line.strip! - next if /^\s*$/.match line - next if line[0]=='#' - content = line.split('#', 2).first - k, v = content.split(/\s*=\s*/, 2) - k.strip!; v.strip! - cfg[k] = v - end - return cfg -end - def bag_of_words s, stopwords=[] s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } -end +end +def splitpipe s, n=3 + s.strip.split("|"*n) +end |