From 8adea055298189643a3c7a76e2d529f536a94e11 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Tue, 23 Apr 2019 13:55:47 +0200
Subject: init
---
rk.rb | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 127 insertions(+)
create mode 100755 rk.rb
(limited to 'rk.rb')
diff --git a/rk.rb b/rk.rb
new file mode 100755
index 0000000..ccbe90b
--- /dev/null
+++ b/rk.rb
@@ -0,0 +1,127 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+# Loads the three inputs of the evaluation.
+#
+# reference_file  - handed to ReadFile.readlines_strip
+# hypotheses_file - handed to ReadFile.readlines_strip
+# stopwords_file  - handed to read_stopwords_file
+#
+# Returns a three-element Array [references, hypotheses, stopwords]; the
+# files are read in that order.
+def setup(reference_file = nil, hypotheses_file = nil, stopwords_file = nil)
+  [
+    ReadFile.readlines_strip(reference_file),
+    ReadFile.readlines_strip(hypotheses_file),
+    read_stopwords_file(stopwords_file)
+  ]
+end
+
+# Accumulates the counts behind the k-shot recall score.
+#
+# Segments are visited in order. For segment i the types of references[i] and
+# hypotheses[i] are obtained from get_types. Every reference type has its
+# running occurrence count incremented; the type then counts towards the
+# denominator if that running count equals k+1 (or, with combined, is at most
+# k+1), and additionally towards the numerator if the hypothesis types include
+# it. Inputs are all assumed to be tokenized and truecased.
+#
+# references     - reference segments, indexed in parallel with hypotheses
+# hypotheses     - hypothesis segments; their indices drive the iteration
+# stopwords      - forwarded unchanged to get_types
+# k              - the (k+1)th occurrence of a type is the one that is scored
+# combined       - false: compare the count with == (R0, R1);
+#                  true: compare with <= (R01)
+# vocab          - forwarded unchanged to get_types
+# rare_threshold - forwarded unchanged to get_types (default: infinity)
+# cumulative     - print the running numerator/total after every segment
+# per_segment    - print each segment's own numerator/total
+# occurrences    - Hash of running counts per type. It is modified in place
+#                  (its default is set to 0 and counts are incremented), so a
+#                  caller-supplied Hash keeps its counts after the call.
+#
+# Whenever a denominator is zero, 0.0 is printed instead of a ratio.
+#
+# Returns [numerator, total]. total is a Float as soon as anything has been
+# counted and stays Integer 0 otherwise.
+def rk(references,
+       hypotheses,
+       stopwords,
+       k,
+       combined = false,
+       vocab = nil,
+       rare_threshold = Float::INFINITY,
+       cumulative = false,
+       per_segment = false,
+       occurrences = {})
+  occurrences.default = 0
+  total = 0
+  numerator = 0
+  cmp = combined ? :<= : :== # == for R0 and R1, <= for R01
+  target = k + 1             # k+1th occurrence, kth-shot
+
+  # Explicit zero check instead of rescuing ZeroDivisionError, so unrelated
+  # errors (e.g. a failing write in puts) are no longer masked.
+  ratio = ->(num, den) { den.zero? ? 0.0 : num / den }
+
+  hypotheses.each_index do |i|
+    r = get_types references[i], stopwords, vocab, rare_threshold
+    h = get_types hypotheses[i], stopwords, vocab, rare_threshold
+    current_total = current_numerator = 0
+
+    r.each do |t|
+      occurrences[t] += 1 # Count occurrences
+
+      # Denominator: exact occurrence count, or up to k+1 when combined
+      next unless occurrences[t].public_send(cmp, target)
+
+      # Float increments make the ratios below floating-point divisions
+      total += 1.0
+      current_total += 1.0
+
+      # Numerator: the scored reference type also shows up in the hypothesis
+      next unless h.include? t
+
+      numerator += 1
+      current_numerator += 1
+    end
+
+    puts ratio.call(current_numerator, current_total) if per_segment
+    puts ratio.call(numerator, total) if cumulative
+  end
+
+  return numerator, total
+end
+
+def main
+ config = Trollop::options do
+ opt :input, "File with hypotheses, truecased and tokenized", :type => :string, :short => "-i", :default => '-'
+ opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+ opt :stopwords, "File with stopwords, one per line", :type => :string, :short => "-s", :required => true
+ opt :k, "Allow k-shot matches", :type => :int, :short => "-k", :default => 1
+ opt :vocab, "Vocab. file, format: