init

author: Patrick Simianer <p@simianer.de> 2019-04-23 13:55:47 +0200
committer: Patrick Simianer <p@simianer.de> 2019-04-23 13:55:47 +0200
commit: 8adea055298189643a3c7a76e2d529f536a94e11 (patch)
tree: 59d40a9ae0ea9ac81e9f39aeaa082bab3c7c29bc
9 files changed, 248 insertions, 0 deletions
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..3d7386c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "stopwords-de"]
+	path = stopwords-de
+	url = https://github.com/stopwords-iso/stopwords-de
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d418110
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+Rk (k={0,1,0+1}) metrics as described in Simianer, Wuebker, and DeNero 2019 [1].
+
+
+[1] https://simianer.de/simianer2019measuring.pdf
+
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..528fa22
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,7 @@
+Example from paper.
+
+```
+$ ../rk.rb -r references.txt -i hypotheses.txt -s stopwords.txt -Z -O -C
+R0=50.0 [2/4]	R1=100.0 [2/2]	R01=66.67 [4/6]
+```
+
diff --git a/example/hypotheses.txt b/example/hypotheses.txt
new file mode 100644
index 0000000..1b76def
--- /dev/null
+++ b/example/hypotheses.txt
@@ -0,0 +1,2 @@
+A terrier bites the person
+The dog bites the man
diff --git a/example/references.txt b/example/references.txt
new file mode 100644
index 0000000..ec8ca27
--- /dev/null
+++ b/example/references.txt
@@ -0,0 +1,2 @@
+The dog bites the lady
+The man bites the dog
diff --git a/example/stopwords.txt b/example/stopwords.txt
new file mode 100644
index 0000000..3677978
--- /dev/null
+++ b/example/stopwords.txt
@@ -0,0 +1,4 @@
+a
+A
+the
+The
diff --git a/rk.rb b/rk.rb
new file mode 100755
index 0000000..ccbe90b
--- /dev/null
+++ b/rk.rb
@@ -0,0 +1,127 @@
+#!/usr/bin/env ruby
+
+require_relative  'util'
+
+def setup reference_file=nil, hypotheses_file=nil, stopwords_file=nil
+  references = ReadFile.readlines_strip reference_file
+  hypotheses = ReadFile.readlines_strip hypotheses_file
+  stopwords  = read_stopwords_file stopwords_file
+
+  return references, hypotheses, stopwords
+end
+
+def rk references,
+       hypotheses,
+       stopwords,
+       k,
+       combined=false,
+       vocab=nil,
+       rare_threshold=1.0/0,
+       cumulative=false,
+       per_segment=false,
+  occurrences = {}; occurrences.default = 0
+  total = 0
+  enumerator = 0
+  if not combined then cmp = :== else cmp = :<= end  # == for R0 and R1, <= for R01
+  hypotheses.each_index { |i|                        # Inputs are all assumed to be tokenized and truecased
+    r = get_types references[i], stopwords, vocab, rare_threshold
+    h = get_types hypotheses[i], stopwords, vocab, rare_threshold
+    current_total = current_enumerator = 0
+    r.each { |t|
+      occurrences[t] += 1                            # Count occurrences
+
+      # Denominator
+      if occurrences[t].public_send(cmp, k+1)        # Count exact occurence count, or up to k+1
+        total += 1.0
+        current_total += 1.0
+      end
+
+      # Enumerator
+      if h.include? t                                # Match!
+        if occurrences[t].public_send(cmp, k+1)      # k+1th occurrence, kth-shot
+          enumerator += 1
+          current_enumerator += 1
+        end
+      end
+    }
+
+    if per_segment
+      begin
+        puts current_enumerator/current_total
+      rescue
+        puts 0.0
+      end
+    end
+
+    if cumulative
+      begin
+        puts enumerator/total
+        #puts total
+      rescue
+        puts 0.0
+      end
+    end
+  }
+
+  return enumerator, total
+end
+
+def main
+  config = Trollop::options do
+    opt :input, "File with hypotheses, truecased and tokenized",       :type => :string, :short => "-i", :default => '-'
+    opt :references, "File with references, truecased and tokenized",  :type => :string, :short => "-r", :required => true
+    opt :stopwords, "File with stopwords, one per line",               :type => :string, :short => "-s", :required => true
+    opt :k, "Allow k-shot matches",                                    :type => :int,    :short => "-k", :default => 1
+    opt :vocab, "Vocab. file, format: <count> <word>",                 :type => :string, :short => "-v", :default => nil
+    opt :rare_threshold, "Max. count up to a word is counted as rare", :type => :int,    :short => "-R", :default => 0
+    opt :zero_shot, "Zero-shot (R0) metric",                           :type => :bool,   :short => "-Z", :default => false
+    opt :one_shot, "One-shot (R<k>) metric",                           :type => :bool,   :short => "-O", :default => false
+    opt :combined, "R0<k> metric",                                     :type => :bool,   :short => "-C", :default => false
+    opt :cumulative, "Output cumulative scores",                       :type => :bool,   :short => "-c", :default => false
+    opt :per_segment, "Output per-segment scores",                     :type => :bool,   :short => "-p", :default => false
+  end
+
+  references, hypotheses, stopwords = setup config[:references], config[:input], config[:stopwords]
+
+  if config[:per_segment] and config[:cumulative]
+    puts "Won't output both per-segment _and_ cumulative scores, exiting!"
+    exit
+  end
+
+  if config[:vocab]
+    vocab = read_vocab_file config[:vocab]
+  else
+    vocab = nil
+  end
+
+  scores = {}
+  hits = {}
+  totals = {}
+
+  ks = []
+  if config[:zero_shot] then ks << 0 end
+  if config[:one_shot] then ks << config[:k] end
+
+  ks.each { |k|
+    enumerator, total  = rk references, hypotheses, stopwords, k, false, vocab, config[:rare_threshold], config[:cumulative], config[:per_segment]
+    scores["R#{k}"] = enumerator / total
+    hits["R#{k}"] = enumerator
+    totals["R#{k}"] = total
+  }
+
+  if config[:combined]
+    enumerator, total = rk references, hypotheses, stopwords, config[:k], true, vocab, config[:rare_threshold], config[:cumulative], config[:per_segment]
+    scores["R0#{config[:k]}"] = enumerator / total
+    hits["R0#{config[:k]}"] = enumerator
+    totals["R0#{config[:k]}"] = total
+  end
+
+  if not config[:cumulative] and not config[:per_segment]
+    puts scores.map { |name,value|
+      "#{name}=#{(value*100.0).round 2} [#{hits[name]}/#{totals[name].to_i}]"
+    }.join "\t"
+  end
+end
+
+main # Runs unconditionally whenever this file is executed or loaded.
+
diff --git a/stats.rb b/stats.rb
new file mode 100755
index 0000000..390cbd9
--- /dev/null
+++ b/stats.rb
@@ -0,0 +1,38 @@
+#!/usr/bin/env ruby
+
+require_relative 'util'
+
+def setup reference_file, stopwords_file
+  references = ReadFile.readlines_strip reference_file
+  stopwords  = read_stopwords_file stopwords_file
+
+  return references, stopwords
+end
+
+def stats references, stopwords
+  references.each { |r|
+    types, uniq_types = get_types r, stopwords
+
+    counts = []
+    uniq_types.each { |t|
+      counts <<  types.count(t)
+    }
+    if counts.size > 0
+      puts counts.inject(:+) / counts.size.to_f
+    end
+  }
+end
+
+def main
+  config = Trollop::options do
+    opt :references, "File with references, truecased and tokenized", :type => :string, :short => "-r", :required => true
+    opt :stopwords, "File with stopwords, one per line",              :type => :string, :short => "-s", :required => true
+  end
+
+  references, stopwords = setup config[:references], config[:stopwords]
+
+  stats references, stopwords
+end
+
+main # Runs unconditionally whenever this file is executed or loaded.
+
diff --git a/util.rb b/util.rb
new file mode 100644
index 0000000..d04f823
--- /dev/null
+++ b/util.rb
@@ -0,0 +1,60 @@
+require 'zipf'
+require 'trollop'
+
+def read_stopwords_file fn
+  stopwords = {}
+  f = ReadFile.new fn
+  while line = f.gets
+    stopwords[line.strip] = true
+  end
+
+  return stopwords
+end
+
+def read_vocab_file fn
+  if fn.split(".")[-1] == "dbm"
+    require 'dbm'
+    return DBM.new fn
+  else
+    vocab = {}
+    f = ReadFile.new fn
+    while line = f.gets
+      count, word = line.split
+      vocab[word] = count.to_i
+    end
+
+    return vocab
+  end
+end
+
+# Returns true if string s is only composed of punctuation or brackets
+def is_punct s
+  return s.match(/^[[[:punct:]]\<\>\[\]\{\}\(\)]+$/)
+end
+
+# Returns true if string is all digits
+def is_num s
+  return s.match(/^[[:digit:]]+$/)
+end
+
+# 'Tokenizer' based on spaces
+def get_tokens s
+  return tokenize s
+end
+
+# Returns array of unique tokens and token counts for the string s
+def get_types s, stopwords, vocab=nil, rare_threshold=1.0/0
+  tokens = get_tokens s
+  types = tokens.select { |tok|
+    !stopwords.include?(tok) and not is_punct(tok) and not is_num(tok)
+  }.uniq
+
+  if vocab
+    types = types.select { |t|
+      !vocab.fetch(t, nil) || vocab[t].to_i <= rare_threshold
+    }
+  end
+
+  return types
+end
+
author	Patrick Simianer <p@simianer.de>	2019-04-23 13:55:47 +0200
committer	Patrick Simianer <p@simianer.de>	2019-04-23 13:55:47 +0200
commit	8adea055298189643a3c7a76e2d529f536a94e11 (patch)
tree	59d40a9ae0ea9ac81e9f39aeaa082bab3c7c29bc