diff options
-rwxr-xr-x | lib/nlp_ruby.rb | 1 | ||||
-rw-r--r-- | lib/nlp_ruby/SparseVector.rb | 15 | ||||
-rw-r--r-- | lib/nlp_ruby/bleu.rb | 25 | ||||
-rw-r--r-- | lib/nlp_ruby/misc.rb | 26 | ||||
-rw-r--r-- | lib/nlp_ruby/stringutil.rb | 5 | ||||
-rw-r--r-- | lib/nlp_ruby/ttable.rb | 17 |
6 files changed, 80 insertions, 9 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb index 212c367..f0242af 100755 --- a/lib/nlp_ruby.rb +++ b/lib/nlp_ruby.rb @@ -11,6 +11,7 @@ require 'nlp_ruby/dags' require 'nlp_ruby/semirings' require 'nlp_ruby/bleu' require 'nlp_ruby/misc' +require 'nlp_ruby/cdec' STDIN.set_encoding 'utf-8' STDOUT.set_encoding 'utf-8' diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb index 9919a65..964ef4e 100644 --- a/lib/nlp_ruby/SparseVector.rb +++ b/lib/nlp_ruby/SparseVector.rb @@ -53,10 +53,21 @@ class SparseVector < Hash return Math.sqrt(sum) end - def to_kv + # FIXME + def from_kv_file fn, sep=' ' + f = ReadFile.new(fn) + while line = f.gets + key, value = line.strip.split sep + value = value.to_f + self[key] = value + end + end + + # FIXME + def to_kv sep='=' a = [] self.each_pair { |k,v| - a << "#{k}=#{v}" + a << "#{k}#{sep}#{v}" } return a.join ' ' end diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb index 42be45e..ee91985 100644 --- a/lib/nlp_ruby/bleu.rb +++ b/lib/nlp_ruby/bleu.rb @@ -79,12 +79,12 @@ def BLEU::get_counts hypothesis, reference, n, times=1 return p end -def BLEU::brevity_penalty(c, r) - if c > r then return 1.0 end +def BLEU::brevity_penalty c, r + return 1.0 if c>r return Math.exp(1-r/c) end -def BLEU::bleu(counts, n, debug=false) +def BLEU::bleu counts, n, debug=false corpus_stats = NgramCounts.new n counts.each { |i| corpus_stats.plus_eq i } sum = 0.0 @@ -105,6 +105,25 @@ def BLEU::hbleu counts, n, debug=false (100*bleu(counts, n, debug)).round(3) end +def BLEU::per_sentence_bleu hypothesis, reference, n=4 + h_ng = {}; r_ng = {} + (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} + ngrams(hypothesis, n) {|i| h_ng[i.size] << i} + ngrams(reference, n) {|i| r_ng[i.size] << i} + m = [n, reference.split.size].min + weight = 1.0/m + add = 0.0 + sum = 0 + (1).upto(m) { |i| + counts_clipped = 0 + counts_sum = h_ng[i].size + h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} + add = 1.0 if i >= 2 + sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); + } + return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum) +end + end diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb index 9a4064f..1fa3878 100644 --- a/lib/nlp_ruby/misc.rb +++ b/lib/nlp_ruby/misc.rb @@ -2,5 +2,31 @@ class Array def max_index self.index(self.max) end + + def is_subset_of? other + self.each { |i| + if other.include? i + return false + end + } + return true + end +end + +def spawn_with_timeout cmd, t=4, debug=false + require 'timeout' + STDERR.write cmd+"\n" if debug + pipe_in, pipe_out = IO.pipe + pid = Process.spawn(cmd, :out => pipe_out) + begin + Timeout.timeout(t) { Process.wait pid } + rescue Timeout::Error + return "" + # accept the zombies + #Process.kill('TERM', pid) + end + pipe_out.close + return pipe_in.read end + diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb index 4091994..d7381bb 100644 --- a/lib/nlp_ruby/stringutil.rb +++ b/lib/nlp_ruby/stringutil.rb @@ -48,3 +48,8 @@ def read_cfg fn return cfg end +def bag_of_words s, stopwords=[] + s.strip.split.uniq.sort.reject{ |w| stopwords.include? w } +end + + diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/ttable.rb index 598e318..14d6c5d 100644 --- a/lib/nlp_ruby/ttable.rb +++ b/lib/nlp_ruby/ttable.rb @@ -15,18 +15,21 @@ def read_phrase_table fn return table end +# FIXME class Translation - attr_accessor :id, :s, :raw, :f, :score + attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score - def initialize id=nil, raw=nil, s=nil, f=nil, score=nil + def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil @id = id @raw = raw @s = s @f = f @score = score + @rank = rank + @other_score = other_score end - def from_s t, strip_alignment=true + def from_s t, strip_alignment=true, rank=nil id, raw, features, score = splitpipe(t, 3) raw.strip! @raw = raw @@ -39,10 +42,16 @@ class Translation @id = id.to_i @f = read_feature_string features @score = score.to_f + @rank = rank + @other_score = nil end def to_s - [id, s, f.to_kv, score].join ' ||| ' + [@id, @s, @f.to_kv, @score].join ' ||| ' + end + + def to_s2 + [@rank, @s, @f.to_kv, @score, @other_score].join ' ||| ' end end |