summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitmodules3
-rw-r--r--README.md5
-rw-r--r--example/README.md7
-rw-r--r--example/hypotheses.txt2
-rw-r--r--example/references.txt2
-rw-r--r--example/stopwords.txt4
-rwxr-xr-xrk.rb127
-rwxr-xr-xstats.rb38
-rw-r--r--util.rb60
9 files changed, 248 insertions, 0 deletions
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..3d7386c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "stopwords-de"]
+ path = stopwords-de
+ url = https://github.com/stopwords-iso/stopwords-de
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d418110
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+Rk (k={0,1,0+1}) metrics as described in Simianer, Wuebker, and DeNero 2019 [1].
+
+
+[1] https://simianer.de/simianer2019measuring.pdf
+
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..528fa22
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,7 @@
+Example from the paper.
+
+```
+$ ../rk.rb -r references.txt -i hypotheses.txt -s stopwords.txt -Z -O -C
+R0=50.0 [2/4] R1=100.0 [2/2] R01=66.67 [4/6]
+```
+
diff --git a/example/hypotheses.txt b/example/hypotheses.txt
new file mode 100644
index 0000000..1b76def
--- /dev/null
+++ b/example/hypotheses.txt
@@ -0,0 +1,2 @@
+A terrier bites the person
+The dog bites the man
diff --git a/example/references.txt b/example/references.txt
new file mode 100644
index 0000000..ec8ca27
--- /dev/null
+++ b/example/references.txt
@@ -0,0 +1,2 @@
+The dog bites the lady
+The man bites the dog
diff --git a/example/stopwords.txt b/example/stopwords.txt
new file mode 100644
index 0000000..3677978
--- /dev/null
+++ b/example/stopwords.txt
@@ -0,0 +1,4 @@
+a
+A
+the
+The
diff --git a/rk.rb b/rk.rb
new file mode 100755
index 0000000..ccbe90b
--- /dev/null
+++ b/rk.rb
@@ -0,0 +1,127 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+# Loads the three inputs of the metric.
+#
+# reference_file  - path handed to ReadFile.readlines_strip for the references.
+# hypotheses_file - path handed to ReadFile.readlines_strip for the hypotheses.
+# stopwords_file  - path handed to read_stopwords_file.
+#
+# Returns [references, hypotheses, stopwords], read in that order.
+def setup reference_file=nil, hypotheses_file=nil, stopwords_file=nil
+  [
+    ReadFile.readlines_strip(reference_file),
+    ReadFile.readlines_strip(hypotheses_file),
+    read_stopwords_file(stopwords_file)
+  ]
+end
+
+# Counts R<k> (or combined R0<k>) matches over parallel references and hypotheses.
+#
+# Segments are visited in order. For each reference, the types returned by
+# get_types are walked and a running occurrence count per type is kept across
+# segments. A type is added to the denominator when its running count equals
+# k+1 (combined=false) or is at most k+1 (combined=true); it is added to the
+# numerator as well when the hypothesis of the same segment contains it.
+#
+# references     - Array of reference segments (tokenized, truecased Strings).
+# hypotheses     - Array of hypothesis segments; iteration follows its indices.
+# stopwords      - stopword collection, passed through to get_types.
+# k              - Integer shot count.
+# combined       - false: exactly the (k+1)-th occurrence counts (R0, R1, ...);
+#                  true: every occurrence up to the (k+1)-th counts (R0<k>).
+# vocab          - optional vocabulary, passed through to get_types.
+# rare_threshold - passed through to get_types together with vocab.
+# cumulative     - print the running enumerator/total after every segment.
+# per_segment    - print each segment's own enumerator/total.
+# occurrences    - optional Hash of running per-type counts to start from.
+#                  It is mutated: its default is set to 0 and counts are
+#                  incremented. Every call without it starts from a fresh Hash.
+#
+# Returns [enumerator, total]. total is the Integer 0 when nothing was counted
+# and a Float otherwise.
+def rk references,
+       hypotheses,
+       stopwords,
+       k,
+       combined=false,
+       vocab=nil,
+       rare_threshold=1.0/0,
+       cumulative=false,
+       per_segment=false,
+       occurrences={}
+  occurrences.default = 0
+  total = 0
+  enumerator = 0
+  cmp = combined ? :<= : :== # == for R0 and R1, <= for R01
+  hypotheses.each_index { |i| # Inputs are all assumed to be tokenized and truecased
+    r = get_types references[i], stopwords, vocab, rare_threshold
+    h = get_types hypotheses[i], stopwords, vocab, rare_threshold
+    current_total = current_enumerator = 0
+    r.each { |t|
+      occurrences[t] += 1 # Count occurrences
+
+      # Denominator: exact occurrence count, or anything up to k+1
+      next unless occurrences[t].public_send(cmp, k+1)
+      total += 1.0
+      current_total += 1.0
+
+      # Enumerator: the hypothesis produced this type as well
+      next unless h.include? t
+      enumerator += 1
+      current_enumerator += 1
+    }
+
+    # A zero denominator is reported as 0.0 instead of dividing by it.
+    if per_segment
+      puts(current_total.zero? ? 0.0 : current_enumerator/current_total)
+    end
+
+    if cumulative
+      puts(total.zero? ? 0.0 : enumerator/total)
+    end
+  }
+
+  return enumerator, total
+end
+
+# Command-line entry point: parses the options, loads the inputs, runs rk for
+# every requested metric (R0 with -Z, R<k> with -O, R0<k> with -C) and, unless
+# cumulative or per-segment output was requested, prints one tab-separated
+# summary line of the form NAME=percent [hits/total].
+def main
+  config = Trollop::options do
+    opt :input, "File with hypotheses, truecased and tokenized", :type => :string, :short => "-i", :default => '-'
+    opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+    opt :stopwords, "File with stopwords, one per line", :type => :string, :short => "-s", :required => true
+    opt :k, "Allow k-shot matches", :type => :int, :short => "-k", :default => 1
+    opt :vocab, "Vocab. file, format: <count> <word>", :type => :string, :short => "-v", :default => nil
+    opt :rare_threshold, "Max. count up to a word is counted as rare", :type => :int, :short => "-R", :default => 0
+    opt :zero_shot, "Zero-shot (R0) metric", :type => :bool, :short => "-Z", :default => false
+    opt :one_shot, "One-shot (R<k>) metric", :type => :bool, :short => "-O", :default => false
+    opt :combined, "R0<k> metric", :type => :bool, :short => "-C", :default => false
+    opt :cumulative, "Output cumulative scores", :type => :bool, :short => "-c", :default => false
+    opt :per_segment, "Output per-segment scores", :type => :bool, :short => "-p", :default => false
+  end
+
+  # Reject the conflicting output modes before any input is read.
+  if config[:per_segment] && config[:cumulative]
+    puts "Won't output both per-segment _and_ cumulative scores, exiting!"
+    exit
+  end
+
+  references, hypotheses, stopwords = setup config[:references], config[:input], config[:stopwords]
+  vocab = config[:vocab] ? read_vocab_file(config[:vocab]) : nil
+
+  scores = {}
+  hits = {}
+  totals = {}
+
+  # Runs rk once and files its result under the given metric name.
+  evaluate = lambda { |name, k, combined|
+    enumerator, total = rk references, hypotheses, stopwords, k, combined, vocab, config[:rare_threshold], config[:cumulative], config[:per_segment]
+    # rk returns the Integer 0 as total when nothing was counted; dividing by
+    # it would raise ZeroDivisionError, so that case is scored as 0.0.
+    scores[name] = total.zero? ? 0.0 : enumerator / total
+    hits[name] = enumerator
+    totals[name] = total
+  }
+
+  ks = []
+  ks << 0 if config[:zero_shot]
+  ks << config[:k] if config[:one_shot]
+  ks.each { |k| evaluate.call("R#{k}", k, false) }
+
+  evaluate.call("R0#{config[:k]}", config[:k], true) if config[:combined]
+
+  unless config[:cumulative] || config[:per_segment]
+    puts scores.map { |name, value|
+      "#{name}=#{(value*100.0).round 2} [#{hits[name]}/#{totals[name].to_i}]"
+    }.join("\t")
+  end
+end
+
+main
+
diff --git a/stats.rb b/stats.rb
new file mode 100755
index 0000000..390cbd9
--- /dev/null
+++ b/stats.rb
@@ -0,0 +1,38 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+# Loads the inputs of the statistics script.
+#
+# reference_file - path handed to ReadFile.readlines_strip.
+# stopwords_file - path handed to read_stopwords_file.
+#
+# Returns [references, stopwords], read in that order.
+def setup reference_file, stopwords_file
+  [
+    ReadFile.readlines_strip(reference_file),
+    read_stopwords_file(stopwords_file)
+  ]
+end
+
+# Prints, for every reference segment, the mean number of occurrences per
+# distinct type, where types are the tokens that are neither stopwords,
+# punctuation nor numbers. Segments without any such token print nothing.
+#
+# references - Array of reference segment Strings.
+# stopwords  - collection queried with include? for each token.
+def stats references, stopwords
+  references.each { |r|
+    # get_types hands back a single de-duplicated list, so repetitions have to
+    # be counted on the filtered token sequence itself.
+    tokens = get_tokens(r).reject { |tok|
+      stopwords.include?(tok) || is_punct(tok) || is_num(tok)
+    }
+    next if tokens.empty?
+
+    # The per-type counts sum to the number of filtered tokens, so their mean
+    # is that number divided by the number of distinct types.
+    puts tokens.size / tokens.uniq.size.to_f
+  }
+end
+
+# Command-line entry point: parses the two required options, loads the
+# references and stopwords, and prints the per-segment statistics.
+def main
+  config = Trollop::options do
+    opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+    opt :stopwords, "File with stopwords, one per line", :type => :string, :short => "-s", :required => true
+  end
+
+  # setup returns [references, stopwords], which is exactly stats' argument list.
+  stats(*setup(config[:references], config[:stopwords]))
+end
+
+main
+
diff --git a/util.rb b/util.rb
new file mode 100644
index 0000000..d04f823
--- /dev/null
+++ b/util.rb
@@ -0,0 +1,60 @@
+require 'zipf'
+require 'trollop'
+
+# Reads the file fn line by line through ReadFile and returns a Hash that maps
+# every stripped line to true.
+def read_stopwords_file fn
+  reader = ReadFile.new fn
+  table = {}
+  loop do
+    entry = reader.gets
+    break unless entry # gets signalled the end of the input
+    table[entry.strip] = true
+  end
+
+  table
+end
+
+# Loads a vocabulary from fn.
+#
+# If the text after the last "." in fn is "dbm", the dbm library is loaded on
+# demand and a DBM handle opened on fn is returned. Otherwise every line is
+# split on whitespace into a count and a word, and a Hash mapping word to the
+# Integer count is returned.
+def read_vocab_file fn
+  if fn.split(".").last == "dbm"
+    require 'dbm'
+    return DBM.new(fn)
+  end
+
+  reader = ReadFile.new fn
+  table = {}
+  while (entry = reader.gets)
+    count, word = entry.split
+    table[word] = count.to_i
+  end
+
+  table
+end
+
+# Checks whether string s is composed only of punctuation or brackets.
+# Returns a MatchData (truthy) if so, nil otherwise (also for the empty string).
+# \A and \z anchor the whole string; ^ and $ would accept any string that
+# merely contains one punctuation-only line.
+def is_punct s
+  s.match(/\A[[[:punct:]]\<\>\[\]\{\}\(\)]+\z/)
+end
+
+# Checks whether string s consists solely of digits.
+# Returns a MatchData (truthy) if so, nil otherwise (also for the empty string).
+# \A and \z anchor the whole string; ^ and $ would accept any string that
+# merely contains one all-digit line.
+def is_num s
+  s.match(/\A[[:digit:]]+\z/)
+end
+
+# Splits string s into tokens by delegating to tokenize.
+# NOTE(review): tokenize is not defined in this file; presumably it comes from
+# the zipf library and splits on spaces - confirm.
+def get_tokens s
+  tokenize(s)
+end
+
+# Returns the array of distinct tokens of string s that are neither stopwords,
+# punctuation nor numbers, in order of first appearance. No counts are returned.
+#
+# If vocab is given, a type is kept only when vocab has no (truthy) entry for
+# it or when its entry, converted with to_i, is at most rare_threshold.
+def get_types s, stopwords, vocab=nil, rare_threshold=1.0/0
+  candidates = get_tokens(s).reject { |tok|
+    stopwords.include?(tok) || is_punct(tok) || is_num(tok)
+  }.uniq
+  return candidates unless vocab
+
+  # Drop every type whose vocabulary count exceeds the threshold.
+  candidates.reject { |t|
+    vocab.fetch(t, nil) && vocab[t].to_i > rare_threshold
+  }
+end
+