From 8adea055298189643a3c7a76e2d529f536a94e11 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Tue, 23 Apr 2019 13:55:47 +0200
Subject: init
---
.gitmodules | 3 ++
README.md | 5 ++
example/README.md | 7 +++
example/hypotheses.txt | 2 +
example/references.txt | 2 +
example/stopwords.txt | 4 ++
rk.rb | 127 +++++++++++++++++++++++++++++++++++++++++++++++++
stats.rb | 38 +++++++++++++++
util.rb | 60 +++++++++++++++++++++++
9 files changed, 248 insertions(+)
create mode 100644 .gitmodules
create mode 100644 README.md
create mode 100644 example/README.md
create mode 100644 example/hypotheses.txt
create mode 100644 example/references.txt
create mode 100644 example/stopwords.txt
create mode 100755 rk.rb
create mode 100755 stats.rb
create mode 100644 util.rb
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..3d7386c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "stopwords-de"]
+ path = stopwords-de
+ url = https://github.com/stopwords-iso/stopwords-de
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d418110
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+Rk (k={0,1,0+1}) metrics as described in Simianer, Wuebker, and DeNero 2019 [1].
+
+
+[1] https://simianer.de/simianer2019measuring.pdf
+
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..528fa22
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,7 @@
+Example from the paper.
+
+```
+$ ../rk.rb -r references.txt -i hypotheses.txt -s stopwords.txt -Z -O -C
+R0=50.0 [2/4] R1=100.0 [2/2] R01=66.67 [4/6]
+```
+
diff --git a/example/hypotheses.txt b/example/hypotheses.txt
new file mode 100644
index 0000000..1b76def
--- /dev/null
+++ b/example/hypotheses.txt
@@ -0,0 +1,2 @@
+A terrier bites the person
+The dog bites the man
diff --git a/example/references.txt b/example/references.txt
new file mode 100644
index 0000000..ec8ca27
--- /dev/null
+++ b/example/references.txt
@@ -0,0 +1,2 @@
+The dog bites the lady
+The man bites the dog
diff --git a/example/stopwords.txt b/example/stopwords.txt
new file mode 100644
index 0000000..3677978
--- /dev/null
+++ b/example/stopwords.txt
@@ -0,0 +1,4 @@
+a
+A
+the
+The
diff --git a/rk.rb b/rk.rb
new file mode 100755
index 0000000..ccbe90b
--- /dev/null
+++ b/rk.rb
@@ -0,0 +1,127 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+# Loads references and hypotheses via ReadFile.readlines_strip and stopwords via read_stopwords_file.
+def setup reference_file=nil, hypotheses_file=nil, stopwords_file=nil
+  refs = ReadFile.readlines_strip reference_file
+  hyps = ReadFile.readlines_strip hypotheses_file
+  stops = read_stopwords_file stopwords_file
+  [refs, hyps, stops] # Same order as the parameters: references, hypotheses, stopwords
+end
+
+# Computes the numerator and denominator of an Rk metric over index-aligned segments.
+#
+# get_types is applied to every reference and hypothesis segment. A running count per
+# reference element is kept across segments. An element enters the denominator when its
+# running count equals k+1 (at most k+1 if combined) and additionally enters the
+# numerator when the hypothesis elements of the same segment include it.
+#
+# stopwords, vocab and rare_threshold are handed to get_types unchanged.
+# per_segment / cumulative print the segment-level / running ratio after each segment
+# (0.0 while the respective denominator is still zero).
+# NOTE(review): occurrences looks like it was meant to be a local; it is kept as a
+# trailing optional parameter only so that existing call sites stay valid -- confirm.
+# Returns [numerator, denominator].
+def rk references,
+       hypotheses,
+       stopwords,
+       k,
+       combined=false,
+       vocab=nil,
+       rare_threshold=1.0/0,
+       cumulative=false,
+       per_segment=false,
+       occurrences={}
+  occurrences.default = 0
+  total = 0
+  numerator = 0
+  cmp = combined ? :<= : :== # == for R0 and R1, <= for R01
+  hypotheses.each_index { |i| # Inputs are all assumed to be tokenized and truecased
+    r = get_types references[i], stopwords, vocab, rare_threshold
+    h = get_types hypotheses[i], stopwords, vocab, rare_threshold
+    current_total = current_numerator = 0
+    r.each { |t|
+      occurrences[t] += 1 # Count occurrences
+
+      # Only the exact (k+1)th occurrence is scored, or any occurrence up to it if combined
+      next unless occurrences[t].public_send(cmp, k+1)
+
+      # Denominator
+      total += 1.0
+      current_total += 1.0
+
+      # Numerator: the reference element is matched by the hypothesis
+      next unless h.include? t
+      numerator += 1
+      current_numerator += 1
+    }
+
+    # Test for an empty denominator explicitly: a bare rescue around puts would also
+    # swallow unrelated errors (e.g. a failing write) and then print a bogus 0.0.
+    puts(current_total.zero? ? 0.0 : current_numerator/current_total) if per_segment
+    puts(total.zero? ? 0.0 : numerator/total) if cumulative
+  }
+
+  return numerator, total
+end
+
+def main
+ config = Trollop::options do
+ opt :input, "File with hypotheses, truecased and tokenized", :type => :string, :short => "-i", :default => '-'
+ opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+ opt :stopwords, "File with stopwords, one per line", :type => :string, :short => "-s", :required => true
+ opt :k, "Allow k-shot matches", :type => :int, :short => "-k", :default => 1
+ opt :vocab, "Vocab. file, format: