From 8adea055298189643a3c7a76e2d529f536a94e11 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Tue, 23 Apr 2019 13:55:47 +0200
Subject: init

---
 .gitmodules            |   3 ++
 README.md              |   5 ++
 example/README.md      |   7 +++
 example/hypotheses.txt |   2 +
 example/references.txt |   2 +
 example/stopwords.txt  |   4 ++
 rk.rb                  | 135 +++++++++++++++++++++++++++++++++++++++++++++++++
 stats.rb               |  43 ++++++++++++++++
 util.rb                |  70 +++++++++++++++++++++++++
 9 files changed, 271 insertions(+)
 create mode 100644 .gitmodules
 create mode 100644 README.md
 create mode 100644 example/README.md
 create mode 100644 example/hypotheses.txt
 create mode 100644 example/references.txt
 create mode 100644 example/stopwords.txt
 create mode 100755 rk.rb
 create mode 100755 stats.rb
 create mode 100644 util.rb

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..3d7386c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "stopwords-de"]
+	path = stopwords-de
+	url = https://github.com/stopwords-iso/stopwords-de
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d418110
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+Rk (k={0,1,0+1}) metrics as described in Simianer, Wuebker, and DeNero 2019 [1].
+
+
+[1] https://simianer.de/simianer2019measuring.pdf
+
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..528fa22
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,7 @@
+Example from paper.
+
+```
+$ ../rk.rb -r references.txt -i hypotheses.txt -s stopwords.txt -Z -O -C
+R0=50.0 [2/4] R1=100.0 [2/2] R01=66.67 [4/6]
+```
+
diff --git a/example/hypotheses.txt b/example/hypotheses.txt
new file mode 100644
index 0000000..1b76def
--- /dev/null
+++ b/example/hypotheses.txt
@@ -0,0 +1,2 @@
+A terrier bites the person
+The dog bites the man
diff --git a/example/references.txt b/example/references.txt
new file mode 100644
index 0000000..ec8ca27
--- /dev/null
+++ b/example/references.txt
@@ -0,0 +1,2 @@
+The dog bites the lady
+The man bites the dog
diff --git a/example/stopwords.txt b/example/stopwords.txt
new file mode 100644
index 0000000..3677978
--- /dev/null
+++ b/example/stopwords.txt
@@ -0,0 +1,4 @@
+a
+A
+the
+The
diff --git a/rk.rb b/rk.rb
new file mode 100755
index 0000000..ccbe90b
--- /dev/null
+++ b/rk.rb
@@ -0,0 +1,135 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+# Reads the reference, hypotheses and stopword files.
+# Returns [references, hypotheses, stopwords]: the first two as read by
+# ReadFile.readlines_strip, the last as built by read_stopwords_file.
+def setup reference_file=nil, hypotheses_file=nil, stopwords_file=nil
+  references = ReadFile.readlines_strip reference_file
+  hypotheses = ReadFile.readlines_strip hypotheses_file
+  stopwords = read_stopwords_file stopwords_file
+
+  return references, hypotheses, stopwords
+end
+
+# Returns enumerator/total as a Float, or 0.0 if nothing was counted
+# (Integer 0/0 would raise ZeroDivisionError).
+def ratio enumerator, total
+  return 0.0 if total == 0
+
+  enumerator / total.to_f
+end
+
+# Counts Rk matches over all segments.
+#
+# For segment i, the types (see get_types) of references[i] are compared to
+# those of hypotheses[i]. Occurrences of each reference type are counted
+# across segments. A type adds to the denominator when its running count
+# equals k+1 (combined == false) or is at most k+1 (combined == true), and
+# also to the enumerator if the hypothesis of the same segment contains it.
+#
+# With per_segment or cumulative set, one score is printed per segment: the
+# segment's own ratio, or the running ratio up to that segment.
+#
+# Returns [enumerator, total].
+def rk references,
+       hypotheses,
+       stopwords,
+       k,
+       combined=false,
+       vocab=nil,
+       rare_threshold=1.0/0,
+       cumulative=false,
+       per_segment=false
+  occurrences = Hash.new 0
+  total = 0
+  enumerator = 0
+  cmp = combined ? :<= : :== # == for R0 and R1, <= for R01
+  hypotheses.each_index { |i| # Inputs are all assumed to be tokenized and truecased
+    r = get_types references[i], stopwords, vocab, rare_threshold
+    h = get_types hypotheses[i], stopwords, vocab, rare_threshold
+    current_total = current_enumerator = 0
+    r.each { |t|
+      occurrences[t] += 1 # Count occurrences
+
+      # Only the exact (k+1)th occurrence counts, or any up to it if combined
+      next unless occurrences[t].public_send(cmp, k+1)
+
+      # Denominator
+      total += 1.0
+      current_total += 1.0
+
+      # Enumerator
+      if h.include? t # Match!
+        enumerator += 1
+        current_enumerator += 1
+      end
+    }
+
+    puts ratio(current_enumerator, current_total) if per_segment
+    puts ratio(enumerator, total) if cumulative
+  }
+
+  return enumerator, total
+end
+
+# Command line entry point: parses the options, computes the requested
+# metrics and prints them tab-separated on one line, unless per-segment or
+# cumulative output was requested (rk prints those itself).
+def main
+  config = Trollop::options do
+    opt :input, "File with hypotheses, truecased and tokenized", :type => :string, :short => "-i", :default => '-'
+    opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+    opt :stopwords, "File with stopwords, one per line", :type => :string, :short => "-s", :required => true
+    opt :k, "Allow k-shot matches", :type => :int, :short => "-k", :default => 1
+    opt :vocab, "Vocab. file, format: COUNT WORD per line", :type => :string, :short => "-v", :default => nil
+    opt :rare_threshold, "Max. count up to which a word is counted as rare", :type => :int, :short => "-R", :default => 0
+    opt :zero_shot, "Zero-shot (R0) metric", :type => :bool, :short => "-Z", :default => false
+    opt :one_shot, "One-shot (Rk) metric", :type => :bool, :short => "-O", :default => false
+    opt :combined, "Combined (R0k) metric", :type => :bool, :short => "-C", :default => false
+    opt :cumulative, "Output cumulative scores", :type => :bool, :short => "-c", :default => false
+    opt :per_segment, "Output per-segment scores", :type => :bool, :short => "-p", :default => false
+  end
+
+  # Reject conflicting output modes before reading any input
+  if config[:per_segment] and config[:cumulative]
+    puts "Won't output both per-segment _and_ cumulative scores, exiting!"
+    exit
+  end
+
+  references, hypotheses, stopwords = setup config[:references], config[:input], config[:stopwords]
+
+  vocab = config[:vocab] ? read_vocab_file(config[:vocab]) : nil
+
+  scores = {}
+  hits = {}
+  totals = {}
+
+  ks = []
+  if config[:zero_shot] then ks << 0 end
+  if config[:one_shot] then ks << config[:k] end
+
+  ks.each { |k|
+    enumerator, total = rk references, hypotheses, stopwords, k, false, vocab, config[:rare_threshold], config[:cumulative], config[:per_segment]
+    scores["R#{k}"] = ratio(enumerator, total)
+    hits["R#{k}"] = enumerator
+    totals["R#{k}"] = total
+  }
+
+  if config[:combined]
+    enumerator, total = rk references, hypotheses, stopwords, config[:k], true, vocab, config[:rare_threshold], config[:cumulative], config[:per_segment]
+    scores["R0#{config[:k]}"] = ratio(enumerator, total)
+    hits["R0#{config[:k]}"] = enumerator
+    totals["R0#{config[:k]}"] = total
+  end
+
+  if not config[:cumulative] and not config[:per_segment]
+    puts scores.map { |name,value|
+      "#{name}=#{(value*100.0).round 2} [#{hits[name]}/#{totals[name].to_i}]"
+    }.join "\t"
+  end
+end
+
+main
+
diff --git a/stats.rb b/stats.rb
new file mode 100755
index 0000000..390cbd9
--- /dev/null
+++ b/stats.rb
@@ -0,0 +1,43 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+# Reads the reference and stopword files.
+# Returns [references, stopwords].
+def setup reference_file, stopwords_file
+  references = ReadFile.readlines_strip reference_file
+  stopwords = read_stopwords_file stopwords_file
+
+  return references, stopwords
+end
+
+# For each reference that has at least one type (see get_types), prints the
+# average number of times its types occur among the tokens of that reference.
+def stats references, stopwords
+  references.each { |r|
+    # get_types returns a single array of unique types, so the occurrences
+    # have to be counted against the full token list.
+    uniq_types = get_types r, stopwords
+    tokens = get_tokens r
+
+    counts = uniq_types.map { |t| tokens.count(t) }
+    if counts.size > 0
+      puts counts.inject(:+) / counts.size.to_f
+    end
+  }
+end
+
+# Command line entry point.
+def main
+  config = Trollop::options do
+    opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+    opt :stopwords, "File with stopwords, one per line", :type => :string, :short => "-s", :required => true
+  end
+
+  references, stopwords = setup config[:references], config[:stopwords]
+
+  stats references, stopwords
+end
+
+main
+
diff --git a/util.rb b/util.rb
new file mode 100644
index 0000000..d04f823
--- /dev/null
+++ b/util.rb
@@ -0,0 +1,70 @@
+require 'zipf'
+require 'trollop'
+
+# Reads stopwords from file fn, one per line.
+# Returns a hash with each stripped line as key (value true).
+def read_stopwords_file fn
+  stopwords = {}
+  f = ReadFile.new fn
+  while line = f.gets
+    stopwords[line.strip] = true
+  end
+
+  return stopwords
+end
+
+# Reads a vocabulary with counts from file fn.
+# A file name whose extension is "dbm" is opened with DBM and returned as is;
+# any other file is parsed as "COUNT WORD" lines into a hash that maps each
+# word to its integer count.
+def read_vocab_file fn
+  if fn.split(".")[-1] == "dbm"
+    require 'dbm'
+    return DBM.new fn
+  else
+    vocab = {}
+    f = ReadFile.new fn
+    while line = f.gets
+      count, word = line.split
+      vocab[word] = count.to_i
+    end
+
+    return vocab
+  end
+end
+
+# Returns a match (truthy) if string s is only composed of punctuation or
+# brackets, nil otherwise
+def is_punct s
+  return s.match(/^[[[:punct:]]\<\>\[\]\{\}\(\)]+$/)
+end
+
+# Returns a match (truthy) if string s is all digits, nil otherwise
+def is_num s
+  return s.match(/^[[:digit:]]+$/)
+end
+
+# 'Tokenizer' based on spaces
+def get_tokens s
+  return tokenize s
+end
+
+# Returns the array of unique types of string s: its tokens without
+# stopwords, punctuation-only and digit-only tokens. If a vocab is given,
+# only types that are missing from it, or whose count is at most
+# rare_threshold, are kept.
+def get_types s, stopwords, vocab=nil, rare_threshold=1.0/0
+  tokens = get_tokens s
+  types = tokens.select { |tok|
+    !stopwords.include?(tok) and not is_punct(tok) and not is_num(tok)
+  }.uniq
+
+  if vocab
+    types = types.select { |t|
+      !vocab.fetch(t, nil) || vocab[t].to_i <= rare_threshold
+    }
+  end
+
+  return types
+end
+
-- 
cgit v1.2.3