diff options
-rwxr-xr-x | avg-seg-len | 9 | ||||
-rwxr-xr-x | cmp | 23 | ||||
-rwxr-xr-x | cumul | 32 | ||||
-rwxr-xr-x | de-bpe | 4 | ||||
-rwxr-xr-x | de-sgm | 2 | ||||
-rw-r--r-- | hist-tok | 24 | ||||
-rwxr-xr-x | overlap | 20 | ||||
-rwxr-xr-x | per-sentence-bleu | 2 |
8 files changed, 114 insertions, 2 deletions
diff --git a/avg-seg-len b/avg-seg-len new file mode 100755 index 0000000..ee68827 --- /dev/null +++ b/avg-seg-len @@ -0,0 +1,9 @@ +#!/usr/bin/env ruby + +lens = [] +while line = STDIN.gets + lens << line.strip.split.size +end + +puts lens.inject(:+)/lens.size.to_f + @@ -0,0 +1,23 @@ +#!/usr/bin/env ruby + +require 'zipf' + +f = ReadFile.new ARGV[0] +g = ReadFile.new ARGV[1] +h = ReadFile.new ARGV[2] + +i = 0 +while line = f.gets + line1 = g.gets + line2 = h.gets + b1 = (BLEU::per_sentence_bleu(line1.strip, line.split, 4)*100).round 1 + b2 = (BLEU::per_sentence_bleu(line2.strip, line.split, 4)*100).round 1 + puts i + puts "ref) " + line + puts "org #{b1}) " + line1 + puts "ada #{b2}) " + line2 + puts " >>> #{(b2-b1).round 1} <<<" + puts + i += 1 +end + @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +require 'zipf' + +f = ReadFile.new ARGV[0] +g = ReadFile.new ARGV[1] +h = ReadFile.new ARGV[2] + +refs = [] +sys1 = [] +sys2 = [] +diffs = [] +while line = f.gets + line1 = g.gets + line2 = h.gets + refs << line + sys1 << line1 + sys2 << line2 + + ff=File.new("/tmp/refs",'w+');ff.write(refs.join(""));ff.close + ff=File.new("/tmp/sys1",'w+');ff.write(sys1.join(""));ff.close + ff=File.new("/tmp/sys2",'w+');ff.write(sys2.join(""));ff.close + + #a = `~/multi-bleu.perl /tmp/refs < /tmp/sys1`.split[2].gsub(',','').to_f + a = BLEU::bleu("/tmp/sys1", "/tmp/refs", 4) + b = BLEU::bleu("/tmp/sys2", "/tmp/refs", 4) + diffs << b-a + + #puts ((diffs.inject(:+)/diffs.size)*100).round 2 + puts (diffs[-1]*100).round 2 +end + @@ -0,0 +1,4 @@ +#!/bin/bash + +sed -E "s/(@@ )|(@@$)//g" + @@ -1,4 +1,4 @@ #!/bin/sh -grep -v -P "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset)[^>]*>)[[:space:]]*$" | grep -v -P "^[[:space:]]*<(url|description|keywords|talkid|title)>.*</(url|description|keywords|talkid|title)>[[:space:]]*$" | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||" +grep -v "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset)[^>]*>)[[:space:]]*$" | grep -v "^[[:space:]]*<(url|description|keywords|talkid|title)>.*</(url|description|keywords|talkid|title)>[[:space:]]*$" | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||" diff --git a/hist-tok b/hist-tok new file mode 100644 index 0000000..b81604f --- /dev/null +++ b/hist-tok @@ -0,0 +1,24 @@ +#!/usr/bin/env ruby + +counts = {} +counts.default = 0 +while line = STDIN.gets + toks = line.strip.split + toks.each { |tok| + counts[tok] += 1 + } +end + +sorted = [] +counts.each_pair { |k,v| + sorted << [k,v] +} + +sorted.sort_by! { |i| + -i[1] +} + +sorted.each { |i| + puts "#{i[0]}\t#{i[1]}" +} + @@ -0,0 +1,20 @@ +#!/usr/bin/env ruby + +require 'zipf' + +a = {} +a.default = 0 +ReadFile.readlines_strip(ARGV[0]).map { |segment| a[segment] += 1 } +b = {} +b.default = 0 +ReadFile.readlines_strip(ARGV[1]).map { |segment| b[segment] += 1 } + +overlap = 0 +a.each_key { |seg| + puts b[seg] + overlap = overlap+b[seg] +} + +puts "---" +puts overlap + diff --git a/per-sentence-bleu b/per-sentence-bleu index 402f364..e70db2b 100755 --- a/per-sentence-bleu +++ b/per-sentence-bleu @@ -20,7 +20,7 @@ def main puts 0.0 next end - puts BLEU::per_sentence_bleu line.strip, refs[i], conf[:n], conf[:len_hack] + puts BLEU::per_sentence_bleu line.strip, refs[i].split, conf[:n], conf[:len_hack] end input.close end |