summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rwxr-xr-xlib/nlp_ruby.rb6
-rw-r--r--lib/nlp_ruby/PriorityQueue.rb37
-rw-r--r--lib/nlp_ruby/SparseVector.rb71
-rw-r--r--lib/nlp_ruby/Translation.rb (renamed from lib/nlp_ruby/ttable.rb)33
-rw-r--r--lib/nlp_ruby/bleu.rb12
-rw-r--r--lib/nlp_ruby/cdec.rb20
-rw-r--r--lib/nlp_ruby/dag.rb (renamed from lib/nlp_ruby/dags.rb)15
-rw-r--r--lib/nlp_ruby/misc.rb74
-rw-r--r--lib/nlp_ruby/semirings.rb3
-rw-r--r--lib/nlp_ruby/stringutil.rb41
10 files changed, 133 insertions, 179 deletions
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
index c7af97b..a3f9f1a 100755
--- a/lib/nlp_ruby.rb
+++ b/lib/nlp_ruby.rb
@@ -3,14 +3,12 @@
require 'nlp_ruby/stringutil'
require 'nlp_ruby/fileutil'
require 'nlp_ruby/SparseVector'
-require 'nlp_ruby/PriorityQueue'
require 'nlp_ruby/tfidf'
-require 'nlp_ruby/ttable'
-require 'nlp_ruby/dags'
+require 'nlp_ruby/Translation'
+require 'nlp_ruby/dag'
require 'nlp_ruby/semirings'
require 'nlp_ruby/bleu'
require 'nlp_ruby/misc'
-require 'nlp_ruby/cdec'
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
deleted file mode 100644
index f090e60..0000000
--- a/lib/nlp_ruby/PriorityQueue.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# FIXME dags
-# this assumes that elements in the queue
-# have a numerical member named 'score'
-class PriorityQueue
-
- def initialize a=Array.new
- @queue = Array.new a
- sort!
- end
-
- def sort!
- @queue.sort_by! { |i| -i.score }
- end
-
- def pop
- @queue.pop
- end
-
- def push i
- @queue << i
- sort!
- end
-
- def empty?
- @queue.empty?
- end
-
- # FIXME
- def to_s
- a = []
- @queue.each { |i|
- a << "#{i.to_s}[#{i.score}]"
- }
- "[#{a.join ', '}]"
- end
-end
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 1c0262b..b80373c 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -20,6 +20,34 @@ class SparseVector < Hash
from_h eval(s)
end
+ def to_kv sep='=', join=' '
+ a = []
+ self.each_pair { |k,v|
+ a << "#{k}#{sep}#{v}"
+ }
+ return a.join join
+ end
+
+ def from_kv s
+ s.split.each { |i|
+ k,v = i.split('=')
+ self[k] = v.to_f
+ }
+ end
+
+ def from_file fn, sep='='
+ f = ReadFile.new(fn)
+ while line = f.gets
+ key, value = line.strip.split sep
+ value = value.to_f
+ self[key] = value
+ end
+ end
+
+ def join_keys other
+ self.keys + other.keys
+ end
+
def sum
self.values.inject(:+)
end
@@ -74,38 +102,6 @@ class SparseVector < Hash
return Math.sqrt(sum)
end
- # FIXME
- def from_kv_file fn, sep=' '
- f = ReadFile.new(fn)
- while line = f.gets
- key, value = line.strip.split sep
- value = value.to_f
- self[key] = value
- end
- end
-
- # FIXME
- def to_kv sep='='
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join ' '
- end
-
- # FIXME
- def to_kv2 sep='='
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join "\n"
- end
-
- def join_keys other
- self.keys + other.keys
- end
-
def + other
new = SparseVector.new
join_keys(other).each { |k|
@@ -132,9 +128,13 @@ class SparseVector < Hash
end
end
-def mean_sparse_vector array_of_vectors
+
+module SparseVector
+
+
+def SparseVector::mean a
mean = SparseVector.new
- array_of_vectors.each { |i|
+ a.each { |i|
i.each_pair { |k,v|
mean[k] += v
}
@@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors
return mean
end
+
+end
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/Translation.rb
index c0f37be..0c346a4 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/Translation.rb
@@ -1,32 +1,13 @@
-# table['some French string'] = [Array of English strings]
-def read_phrase_table fn
- table = {}
- f = ReadFile.new fn
- while raw_rule = f.gets
- french, english, features = splitpipe(raw_rule)
- feature_map = read_feature_string(features)
- if table.has_key? french
- table[french] << [english, feature_map ]
- else
- table[french] = [[english, feature_map]]
- end
- end
- f.close
- return table
-end
-
-# FIXME
class Translation
attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
- def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
+ def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
@id = id
@raw = raw
@s = s
@f = f
- @score = score
+ @scores = scores
@rank = rank
- @other_score = other_score
end
def from_s t, strip_alignment=true, rank=nil
@@ -41,17 +22,17 @@ class Translation
end
@id = id.to_i
@f = read_feature_string features
- @score = score.to_f
+ @scores['decoder'] = score.to_f
@rank = rank
- @other_score = nil
end
- def to_s
- [@id, @s, @f.to_kv, @score].join ' ||| '
+ def to_s include_features=true
+ [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features
+ [@id, @s, @scores['decoder']].join(' ||| ') if !include_features
end
def to_s2
- [@rank, @s, @score, @other_score].join ' ||| '
+ [@rank, @s, @score, @scores.to_s].join ' ||| '
end
end
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index ee91985..d7a6b2b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1
return p
end
-def BLEU::brevity_penalty c, r
+def BLEU::brevity_penalty c, r, hack=0.0
return 1.0 if c>r
- return Math.exp(1-r/c)
+ return Math.exp 1.0-((r+hack)/c)
end
def BLEU::bleu counts, n, debug=false
@@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false
(100*bleu(counts, n, debug)).round(3)
end
-def BLEU::per_sentence_bleu hypothesis, reference, n=4
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
h_ng = {}; r_ng = {}
(1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
@@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4
(1).upto(m) { |i|
counts_clipped = 0
counts_sum = h_ng[i].size
- h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
+ h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
add = 1.0 if i >= 2
sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
- }
+ }
return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
end
-end
+end # module
diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb
deleted file mode 100644
index 1080f14..0000000
--- a/lib/nlp_ruby/cdec.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-module CDEC
-
-require 'open3'
-
-
-# FIXME
-CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec"
-
-
-def CDEC::kbest input, ini, weights, k, unique=true
- o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r 2>/dev/null"
- j = -1
- ret = []
- o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t}
- return ret
-end
-
-
-end
-
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dag.rb
index 7767be1..cca35c5 100644
--- a/lib/nlp_ruby/dags.rb
+++ b/lib/nlp_ruby/dag.rb
@@ -1,21 +1,8 @@
-###########################
-# TODO
-# output paths
-# visualization?
-# algorithms:
-# beam search
-# best-first
-# kbest
-# kruskal (MST)?
-# transitive closure?
-###########################
+module DAG
require 'json'
-module DAG
-
-
class DAG::Node
attr_accessor :label, :edges, :incoming, :score, :mark
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 80d932c..0f58100 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -21,6 +21,40 @@ class Array
end
end
+class String
+
+ def downcase? s
+ s[/[[:lower:]]/]
+ end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+ def initialize a=Array.new
+ @queue = Array.new a
+ sort!
+ end
+
+ def sort!
+ @queue.sort_by! { |i| -i.score }
+ end
+
+ def pop
+ @queue.pop
+ end
+
+ def push i
+ @queue << i
+ sort!
+ end
+
+ def empty?
+ @queue.empty?
+ end
+end
+
def spawn_with_timeout cmd, t=4, debug=false
require 'timeout'
STDERR.write cmd+"\n" if debug
@@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false
return pipe_in.read
end
+def read_phrase_table fn
+ table = {}
+ f = ReadFile.new fn
+ while raw_rule = f.gets
+ french, english, features = splitpipe(raw_rule)
+ feature_map = read_feature_string(features)
+ if table.has_key? french
+ table[french] << [english, feature_map ]
+ else
+ table[french] = [[english, feature_map]]
+ end
+ end
+ f.close
+ return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+ require 'open3'
+ cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+ cmd += " -r" if unique
+ o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
+ a = []; j = -1
+ o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+ return a
+end
+
+def read_config fn
+ f = ReadFile.new fn
+ cfg = {}
+ while line = f.gets
+ line.strip!
+ next if /^\s*$/.match line
+ next if line[0]=='#'
+ content = line.split('#', 2).first
+ k, v = content.split(/\s*=\s*/, 2)
+ k.strip!; v.strip!
+ cfg[k] = v
+ end
+ return cfg
+end
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
index a06f151..83551a9 100644
--- a/lib/nlp_ruby/semirings.rb
+++ b/lib/nlp_ruby/semirings.rb
@@ -1,4 +1,5 @@
-# semirings for graphs as described in
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
# 'Dynamic Programming Algorithms in
# Semiring and Hypergraph Frameworks' (Liang Huang)
class Semiring
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
def tokenize s
s.strip.split
end
-def splitpipe s, n=3
- s.strip.split("|"*n)
-end
-
-def downcase? s
- s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
def ngrams(s, n, fix=false)
a = tokenize s
a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
}
end
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
- map = SparseVector.new
- tokenize(s).each { |i|
- key, value = i.split '='
- map[key] = value.to_f
- }
- return map
-end
-
-
-def read_cfg fn
- f = ReadFile.new fn
- cfg = {}
- while line = f.gets
- line.strip!
- next if /^\s*$/.match line
- next if line[0]=='#'
- content = line.split('#', 2).first
- k, v = content.split(/\s*=\s*/, 2)
- k.strip!; v.strip!
- cfg[k] = v
- end
- return cfg
-end
-
def bag_of_words s, stopwords=[]
s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end
+end
+def splitpipe s, n=3
+ s.strip.split("|"*n)
+end