diff options
Diffstat (limited to 'lib')
-rwxr-xr-x | lib/nlp_ruby.rb | 6 | ||||
-rw-r--r-- | lib/nlp_ruby/PriorityQueue.rb | 37 | ||||
-rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 71 | ||||
-rw-r--r-- | lib/nlp_ruby/Translation.rb (renamed from lib/nlp_ruby/ttable.rb) | 33 | ||||
-rw-r--r-- | lib/nlp_ruby/bleu.rb | 12 | ||||
-rw-r--r-- | lib/nlp_ruby/cdec.rb | 20 | ||||
-rw-r--r-- | lib/nlp_ruby/dag.rb (renamed from lib/nlp_ruby/dags.rb) | 15 | ||||
-rw-r--r-- | lib/nlp_ruby/misc.rb | 74 | ||||
-rw-r--r-- | lib/nlp_ruby/semirings.rb | 3 | ||||
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 41 |
10 files changed, 133 insertions, 179 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb index c7af97b..a3f9f1a 100755 --- a/lib/nlp_ruby.rb +++ b/lib/nlp_ruby.rb @@ -3,14 +3,12 @@ require 'nlp_ruby/stringutil' require 'nlp_ruby/fileutil' require 'nlp_ruby/SparseVector' -require 'nlp_ruby/PriorityQueue' require 'nlp_ruby/tfidf' -require 'nlp_ruby/ttable' -require 'nlp_ruby/dags' +require 'nlp_ruby/Translation' +require 'nlp_ruby/dag' require 'nlp_ruby/semirings' require 'nlp_ruby/bleu' require 'nlp_ruby/misc' -require 'nlp_ruby/cdec' STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb deleted file mode 100644 index f090e60..0000000 --- a/lib/nlp_ruby/PriorityQueue.rb +++ /dev/null @@ -1,37 +0,0 @@ -# FIXME dags -# this assumes that elements in the queue -# have a numerical member named 'score' -class PriorityQueue - - def initialize a=Array.new - @queue = Array.new a - sort! - end - - def sort! - @queue.sort_by! { |i| -i.score } - end - - def pop - @queue.pop - end - - def push i - @queue << i - sort! - end - - def empty? - @queue.empty? - end - - # FIXME - def to_s - a = [] - @queue.each { |i| - a << "#{i.to_s}[#{i.score}]" - } - "[#{a.join ', '}]" - end -end - diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index 1c0262b..b80373c 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -20,6 +20,34 @@ class SparseVector < Hash from_h eval(s) end + def to_kv sep='=', join=' ' + a = [] + self.each_pair { |k,v| + a << "#{k}#{sep}#{v}" + } + return a.join join + end + + def from_kv s + s.split.each { |i| + k,v = i.split('=') + self[k] = v.to_f + } + end + + def from_file fn, sep='=' + f = ReadFile.new(fn) + while line = f.gets + key, value = line.strip.split sep + value = value.to_f + self[key] = value + end + end + + def join_keys other + self.keys + other.keys + end + def sum self.values.inject(:+) end @@ -74,38 +102,6 @@ class SparseVector < Hash return Math.sqrt(sum) end - # FIXME - def from_kv_file fn, sep=' ' - f = ReadFile.new(fn) - while line = f.gets - key, value = line.strip.split sep - value = value.to_f - self[key] = value - end - end - - # FIXME - def to_kv sep='=' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join ' ' - end - - # FIXME - def to_kv2 sep='=' - a = [] - self.each_pair { |k,v| - a << "#{k}#{sep}#{v}" - } - return a.join "\n" - end - - def join_keys other - self.keys + other.keys - end - def + other new = SparseVector.new join_keys(other).each { |k| @@ -132,9 +128,13 @@ class SparseVector < Hash end end -def mean_sparse_vector array_of_vectors + +module SparseVector + + +def SparseVector::mean a mean = SparseVector.new - array_of_vectors.each { |i| + a.each { |i| i.each_pair { |k,v| mean[k] += v } @@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors return mean end + +end + diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/Translation.rb index c0f37be..0c346a4 100644 --- a/lib/nlp_ruby/ttable.rb +++ b/lib/nlp_ruby/Translation.rb @@ -1,32 +1,13 @@ -# table['some French string'] = [Array of English strings] -def read_phrase_table fn - table = {} - f = ReadFile.new fn - while raw_rule = f.gets - french, english, features = splitpipe(raw_rule) - feature_map = read_feature_string(features) - if table.has_key? french - table[french] << [english, feature_map ] - else - table[french] = [[english, feature_map]] - end - end - f.close - return table -end - -# FIXME class Translation attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score - def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil + def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil @id = id @raw = raw @s = s @f = f - @score = score + @scores = scores @rank = rank - @other_score = other_score end def from_s t, strip_alignment=true, rank=nil @@ -41,17 +22,17 @@ class Translation end @id = id.to_i @f = read_feature_string features - @score = score.to_f + @scores['decoder'] = score.to_f @rank = rank - @other_score = nil end - def to_s - [@id, @s, @f.to_kv, @score].join ' ||| ' + def to_s include_features=true + [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features + [@id, @s, @scores['decoder']].join(' ||| ') if !include_features end def to_s2 - [@rank, @s, @score, @other_score].join ' ||| ' + [@rank, @s, @score, @scores.to_s].join ' ||| ' end end diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb index ee91985..d7a6b2b 100644 --- a/lib/nlp_ruby/bleu.rb +++ b/lib/nlp_ruby/bleu.rb @@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1 return p end -def BLEU::brevity_penalty c, r +def BLEU::brevity_penalty c, r, hack=0.0 return 1.0 if c>r - return Math.exp(1-r/c) + return Math.exp 1.0-((r+hack)/c) end def BLEU::bleu counts, n, debug=false @@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false (100*bleu(counts, n, debug)).round(3) end -def BLEU::per_sentence_bleu hypothesis, reference, n=4 +def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0 h_ng = {}; r_ng = {} (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} ngrams(hypothesis, n) {|i| h_ng[i.size] << i} @@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4 (1).upto(m) { |i| counts_clipped = 0 counts_sum = h_ng[i].size - h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} + h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) } add = 1.0 if i >= 2 sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); - } + } return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum) end -end +end # module diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb deleted file mode 100644 index 1080f14..0000000 --- a/lib/nlp_ruby/cdec.rb +++ /dev/null @@ -1,20 +0,0 @@ -module CDEC - -require 'open3' - - -# FIXME -CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec" - - -def CDEC::kbest input, ini, weights, k, unique=true - o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r 2>/dev/null" - j = -1 - ret = [] - o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t} - return ret -end - - -end - diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dag.rb index 7767be1..cca35c5 100644 --- a/lib/nlp_ruby/dags.rb +++ b/lib/nlp_ruby/dag.rb @@ -1,21 +1,8 @@ -########################### -# TODO -# output paths -# visualization? -# algorithms: -# beam search -# best-first -# kbest -# kruskal (MST)? -# transitive closure? -########################### +module DAG require 'json' -module DAG - - class DAG::Node attr_accessor :label, :edges, :incoming, :score, :mark diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb index 80d932c..0f58100 100644 --- a/lib/nlp_ruby/misc.rb +++ b/lib/nlp_ruby/misc.rb @@ -21,6 +21,40 @@ class Array end end +class String + + def downcase? s + s[/[[:lower:]]/] + end +end + +class PriorityQueue +# This assumes that elements in the queue +# have a numerical member named 'score'. + + def initialize a=Array.new + @queue = Array.new a + sort! + end + + def sort! + @queue.sort_by! { |i| -i.score } + end + + def pop + @queue.pop + end + + def push i + @queue << i + sort! + end + + def empty? + @queue.empty? + end +end + def spawn_with_timeout cmd, t=4, debug=false require 'timeout' STDERR.write cmd+"\n" if debug @@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false return pipe_in.read end +def read_phrase_table fn + table = {} + f = ReadFile.new fn + while raw_rule = f.gets + french, english, features = splitpipe(raw_rule) + feature_map = read_feature_string(features) + if table.has_key? french + table[french] << [english, feature_map ] + else + table[french] = [[english, feature_map]] + end + end + f.close + return table +end + +def cdec_kbest cdec_bin, input, ini, weights, k, unique=true + require 'open3' + cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}" + cmd += " -r" if unique + o,_ = Open3.capture2 "#{cmd} 2>/dev/null" + a = []; j = -1 + o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t } + return a +end + +def read_config fn + f = ReadFile.new fn + cfg = {} + while line = f.gets + line.strip! + next if /^\s*$/.match line + next if line[0]=='#' + content = line.split('#', 2).first + k, v = content.split(/\s*=\s*/, 2) + k.strip!; v.strip! + cfg[k] = v + end + return cfg +end diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb index a06f151..83551a9 100644 --- a/lib/nlp_ruby/semirings.rb +++ b/lib/nlp_ruby/semirings.rb @@ -1,4 +1,5 @@ -# semirings for graphs as described in +# Semirings for directed acyclic graphs (dags) (also directed hypergraphs), +# as described in: # 'Dynamic Programming Algorithms in # Semiring and Hypergraph Frameworks' (Liang Huang) class Semiring diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb index d7381bb..aa9be00 100644 --- a/lib/nlp_ruby/stringutil.rb +++ b/lib/nlp_ruby/stringutil.rb @@ -1,17 +1,7 @@ -# whitespace 'tokenizer' def tokenize s s.strip.split end -def splitpipe s, n=3 - s.strip.split("|"*n) -end - -def downcase? s - s[/[[:lower:]]/] -end - -# iterator over n-grams def ngrams(s, n, fix=false) a = tokenize s a.each_with_index { |tok, i| @@ -22,34 +12,11 @@ def ngrams(s, n, fix=false) } end -# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 } -def read_feature_string s - map = SparseVector.new - tokenize(s).each { |i| - key, value = i.split '=' - map[key] = value.to_f - } - return map -end - - -def read_cfg fn - f = ReadFile.new fn - cfg = {} - while line = f.gets - line.strip! - next if /^\s*$/.match line - next if line[0]=='#' - content = line.split('#', 2).first - k, v = content.split(/\s*=\s*/, 2) - k.strip!; v.strip! - cfg[k] = v - end - return cfg -end - def bag_of_words s, stopwords=[] s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } -end +end +def splitpipe s, n=3 + s.strip.split("|"*n) +end |