summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-02-14 17:14:49 +0100
committer: Patrick Simianer <p@simianer.de> 2014-02-14 17:14:49 +0100
commit: c0daa3e70cc3187f04f67c2cdc0bd3b3217e8aa6 (patch)
tree: f7030f39f20e21148ed817142eee0536e557c0c0
parent: 7255d33914122e58b031108de49918b8910eebc6 (diff)
=> 0.3; License and README updated; some from_* methods for SparseVector; ttable.rb => Translation.rb; moved some misc. stuff to misc.rb; monkey patched String
-rw-r--r-- LICENSE 22
-rw-r--r-- README.md 18
-rwxr-xr-x lib/nlp_ruby.rb 6
-rw-r--r-- lib/nlp_ruby/PriorityQueue.rb 37
-rw-r--r-- lib/nlp_ruby/SparseVector.rb 71
-rw-r--r-- lib/nlp_ruby/Translation.rb (renamed from lib/nlp_ruby/ttable.rb) 33
-rw-r--r-- lib/nlp_ruby/bleu.rb 12
-rw-r--r-- lib/nlp_ruby/cdec.rb 20
-rw-r--r-- lib/nlp_ruby/dag.rb (renamed from lib/nlp_ruby/dags.rb) 15
-rw-r--r-- lib/nlp_ruby/misc.rb 74
-rw-r--r-- lib/nlp_ruby/semirings.rb 3
-rw-r--r-- lib/nlp_ruby/stringutil.rb 41
12 files changed, 161 insertions, 191 deletions
diff --git a/LICENSE b/LICENSE
index 0d5dab3..cf4d89e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,23 @@
+The MIT License
+
Copyright (C) 2014 Patrick Simianer <p ät simianer.de>
+http://simianer.de
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 26858ff..5db1487 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,13 @@ nlp_ruby
My little NLP library, supposed to make _my_ work a little easier and less redundant.
The .gem can be found here: https://rubygems.org/gems/nlp_ruby
-* dags.rb : implementation of a directed acyclic graph and various algorithms
-* fileutil.rb : file utilities
-* PriorityQueue.rb : a simple priority queue
-* semirings.rb : semirings for dags.rb
-* SparseVector.rb : sparse vectors for ruby, based on Hash
-* stringutil.rb : string utilities
-* tfidf.rb : functions to calculate tf/ntf/idf
-* ttable.rb : functions to read MT phrase tables
-* Vector.rb : vector class based on Array
+ bleu.rb : BLEU implementation, also per-sentence-BLEU
+ dag.rb : implementation of a directed acyclic graph and various algorithms
+ fileutil.rb : file utilities
+ misc.rb : misc. stuff (e.g. monkey patches for Array and String)
+ semirings.rb : semirings (used in dag.rb)
+ SparseVector.rb : sparse vectors for ruby, based on Hash class
+ stringutil.rb : string utilities
+ tfidf.rb : functions to calculate tf/ntf/idf
+ Translation.rb : an object for a single translation (e.g. one ' ||| '-separated entry of a decoder's k-best list)
diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
index c7af97b..a3f9f1a 100755
--- a/lib/nlp_ruby.rb
+++ b/lib/nlp_ruby.rb
@@ -3,14 +3,12 @@
require 'nlp_ruby/stringutil'
require 'nlp_ruby/fileutil'
require 'nlp_ruby/SparseVector'
-require 'nlp_ruby/PriorityQueue'
require 'nlp_ruby/tfidf'
-require 'nlp_ruby/ttable'
-require 'nlp_ruby/dags'
+require 'nlp_ruby/Translation'
+require 'nlp_ruby/dag'
require 'nlp_ruby/semirings'
require 'nlp_ruby/bleu'
require 'nlp_ruby/misc'
-require 'nlp_ruby/cdec'
STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'
diff --git a/lib/nlp_ruby/PriorityQueue.rb b/lib/nlp_ruby/PriorityQueue.rb
deleted file mode 100644
index f090e60..0000000
--- a/lib/nlp_ruby/PriorityQueue.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# FIXME dags
-# this assumes that elements in the queue
-# have a numerical member named 'score'
-class PriorityQueue
-
- def initialize a=Array.new
- @queue = Array.new a
- sort!
- end
-
- def sort!
- @queue.sort_by! { |i| -i.score }
- end
-
- def pop
- @queue.pop
- end
-
- def push i
- @queue << i
- sort!
- end
-
- def empty?
- @queue.empty?
- end
-
- # FIXME
- def to_s
- a = []
- @queue.each { |i|
- a << "#{i.to_s}[#{i.score}]"
- }
- "[#{a.join ', '}]"
- end
-end
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
index 1c0262b..b80373c 100644
--- a/lib/nlp_ruby/SparseVector.rb
+++ b/lib/nlp_ruby/SparseVector.rb
@@ -20,6 +20,34 @@ class SparseVector < Hash
from_h eval(s)
end
+ def to_kv sep='=', join=' '
+ a = []
+ self.each_pair { |k,v|
+ a << "#{k}#{sep}#{v}"
+ }
+ return a.join join
+ end
+
+ def from_kv s
+ s.split.each { |i|
+ k,v = i.split('=')
+ self[k] = v.to_f
+ }
+ end
+
+ def from_file fn, sep='='
+ f = ReadFile.new(fn)
+ while line = f.gets
+ key, value = line.strip.split sep
+ value = value.to_f
+ self[key] = value
+ end
+ end
+
+ def join_keys other
+ self.keys + other.keys
+ end
+
def sum
self.values.inject(:+)
end
@@ -74,38 +102,6 @@ class SparseVector < Hash
return Math.sqrt(sum)
end
- # FIXME
- def from_kv_file fn, sep=' '
- f = ReadFile.new(fn)
- while line = f.gets
- key, value = line.strip.split sep
- value = value.to_f
- self[key] = value
- end
- end
-
- # FIXME
- def to_kv sep='='
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join ' '
- end
-
- # FIXME
- def to_kv2 sep='='
- a = []
- self.each_pair { |k,v|
- a << "#{k}#{sep}#{v}"
- }
- return a.join "\n"
- end
-
- def join_keys other
- self.keys + other.keys
- end
-
def + other
new = SparseVector.new
join_keys(other).each { |k|
@@ -132,9 +128,13 @@ class SparseVector < Hash
end
end
-def mean_sparse_vector array_of_vectors
+
+module SparseVector
+
+
+def SparseVector::mean a
mean = SparseVector.new
- array_of_vectors.each { |i|
+ a.each { |i|
i.each_pair { |k,v|
mean[k] += v
}
@@ -144,3 +144,6 @@ def mean_sparse_vector array_of_vectors
return mean
end
+
+end
+
diff --git a/lib/nlp_ruby/ttable.rb b/lib/nlp_ruby/Translation.rb
index c0f37be..0c346a4 100644
--- a/lib/nlp_ruby/ttable.rb
+++ b/lib/nlp_ruby/Translation.rb
@@ -1,32 +1,13 @@
-# table['some French string'] = [Array of English strings]
-def read_phrase_table fn
- table = {}
- f = ReadFile.new fn
- while raw_rule = f.gets
- french, english, features = splitpipe(raw_rule)
- feature_map = read_feature_string(features)
- if table.has_key? french
- table[french] << [english, feature_map ]
- else
- table[french] = [[english, feature_map]]
- end
- end
- f.close
- return table
-end
-
-# FIXME
class Translation
attr_accessor :id, :s, :raw, :f, :score, :rank, :other_score
- def initialize id=nil, raw=nil, s=nil, f=nil, score=nil, rank=nil, other_score=nil
+ def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
@id = id
@raw = raw
@s = s
@f = f
- @score = score
+ @scores = scores
@rank = rank
- @other_score = other_score
end
def from_s t, strip_alignment=true, rank=nil
@@ -41,17 +22,17 @@ class Translation
end
@id = id.to_i
@f = read_feature_string features
- @score = score.to_f
+ @scores['decoder'] = score.to_f
@rank = rank
- @other_score = nil
end
- def to_s
- [@id, @s, @f.to_kv, @score].join ' ||| '
+ def to_s include_features=true
+ [@id, @s, @f.to_kv('=', ' '), @scores['decoder']].join(' ||| ') if include_features
+ [@id, @s, @scores['decoder']].join(' ||| ') if !include_features
end
def to_s2
- [@rank, @s, @score, @other_score].join ' ||| '
+ [@rank, @s, @score, @scores.to_s].join ' ||| '
end
end
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
index ee91985..d7a6b2b 100644
--- a/lib/nlp_ruby/bleu.rb
+++ b/lib/nlp_ruby/bleu.rb
@@ -79,9 +79,9 @@ def BLEU::get_counts hypothesis, reference, n, times=1
return p
end
-def BLEU::brevity_penalty c, r
+def BLEU::brevity_penalty c, r, hack=0.0
return 1.0 if c>r
- return Math.exp(1-r/c)
+ return Math.exp 1.0-((r+hack)/c)
end
def BLEU::bleu counts, n, debug=false
@@ -105,7 +105,7 @@ def BLEU::hbleu counts, n, debug=false
(100*bleu(counts, n, debug)).round(3)
end
-def BLEU::per_sentence_bleu hypothesis, reference, n=4
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, hack=0.0
h_ng = {}; r_ng = {}
(1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
@@ -117,13 +117,13 @@ def BLEU::per_sentence_bleu hypothesis, reference, n=4
(1).upto(m) { |i|
counts_clipped = 0
counts_sum = h_ng[i].size
- h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
+ h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
add = 1.0 if i >= 2
sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
- }
+ }
return brevity_penalty(hypothesis.strip.split.size, reference.strip.split.size) * Math.exp(sum)
end
-end
+end # module
diff --git a/lib/nlp_ruby/cdec.rb b/lib/nlp_ruby/cdec.rb
deleted file mode 100644
index 1080f14..0000000
--- a/lib/nlp_ruby/cdec.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-module CDEC
-
-require 'open3'
-
-
-# FIXME
-CDEC_BINARY = "/toolbox/cdec-dtrain/decoder/cdec"
-
-
-def CDEC::kbest input, ini, weights, k, unique=true
- o, s = Open3.capture2 "echo \"#{input}\" | #{CDEC_BINARY} -c #{ini} -w #{weights} -k #{k} -r 2>/dev/null"
- j = -1
- ret = []
- o.split("\n").map{|i| j+=1; t=Translation.new; t.from_s(i, false, j); ret << t}
- return ret
-end
-
-
-end
-
diff --git a/lib/nlp_ruby/dags.rb b/lib/nlp_ruby/dag.rb
index 7767be1..cca35c5 100644
--- a/lib/nlp_ruby/dags.rb
+++ b/lib/nlp_ruby/dag.rb
@@ -1,21 +1,8 @@
-###########################
-# TODO
-# output paths
-# visualization?
-# algorithms:
-# beam search
-# best-first
-# kbest
-# kruskal (MST)?
-# transitive closure?
-###########################
+module DAG
require 'json'
-module DAG
-
-
class DAG::Node
attr_accessor :label, :edges, :incoming, :score, :mark
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
index 80d932c..0f58100 100644
--- a/lib/nlp_ruby/misc.rb
+++ b/lib/nlp_ruby/misc.rb
@@ -21,6 +21,40 @@ class Array
end
end
+class String
+
+ def downcase? s
+ s[/[[:lower:]]/]
+ end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+ def initialize a=Array.new
+ @queue = Array.new a
+ sort!
+ end
+
+ def sort!
+ @queue.sort_by! { |i| -i.score }
+ end
+
+ def pop
+ @queue.pop
+ end
+
+ def push i
+ @queue << i
+ sort!
+ end
+
+ def empty?
+ @queue.empty?
+ end
+end
+
def spawn_with_timeout cmd, t=4, debug=false
require 'timeout'
STDERR.write cmd+"\n" if debug
@@ -37,4 +71,44 @@ def spawn_with_timeout cmd, t=4, debug=false
return pipe_in.read
end
+def read_phrase_table fn
+ table = {}
+ f = ReadFile.new fn
+ while raw_rule = f.gets
+ french, english, features = splitpipe(raw_rule)
+ feature_map = read_feature_string(features)
+ if table.has_key? french
+ table[french] << [english, feature_map ]
+ else
+ table[french] = [[english, feature_map]]
+ end
+ end
+ f.close
+ return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+ require 'open3'
+ cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+ cmd += " -r" if unique
+ o,_ = Open3.capture2 "#{cmd} 2>/dev/null"
+ a = []; j = -1
+ o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+ return a
+end
+
+def read_config fn
+ f = ReadFile.new fn
+ cfg = {}
+ while line = f.gets
+ line.strip!
+ next if /^\s*$/.match line
+ next if line[0]=='#'
+ content = line.split('#', 2).first
+ k, v = content.split(/\s*=\s*/, 2)
+ k.strip!; v.strip!
+ cfg[k] = v
+ end
+ return cfg
+end
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
index a06f151..83551a9 100644
--- a/lib/nlp_ruby/semirings.rb
+++ b/lib/nlp_ruby/semirings.rb
@@ -1,4 +1,5 @@
-# semirings for graphs as described in
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
# 'Dynamic Programming Algorithms in
# Semiring and Hypergraph Frameworks' (Liang Huang)
class Semiring
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
index d7381bb..aa9be00 100644
--- a/lib/nlp_ruby/stringutil.rb
+++ b/lib/nlp_ruby/stringutil.rb
@@ -1,17 +1,7 @@
-# whitespace 'tokenizer'
def tokenize s
s.strip.split
end
-def splitpipe s, n=3
- s.strip.split("|"*n)
-end
-
-def downcase? s
- s[/[[:lower:]]/]
-end
-
-# iterator over n-grams
def ngrams(s, n, fix=false)
a = tokenize s
a.each_with_index { |tok, i|
@@ -22,34 +12,11 @@ def ngrams(s, n, fix=false)
}
end
-# a=1.0 b=2.0 => { 'a' => 1.0, 'b' => 2.0 }
-def read_feature_string s
- map = SparseVector.new
- tokenize(s).each { |i|
- key, value = i.split '='
- map[key] = value.to_f
- }
- return map
-end
-
-
-def read_cfg fn
- f = ReadFile.new fn
- cfg = {}
- while line = f.gets
- line.strip!
- next if /^\s*$/.match line
- next if line[0]=='#'
- content = line.split('#', 2).first
- k, v = content.split(/\s*=\s*/, 2)
- k.strip!; v.strip!
- cfg[k] = v
- end
- return cfg
-end
-
def bag_of_words s, stopwords=[]
s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end
+end
+def splitpipe s, n=3
+ s.strip.split("|"*n)
+end