summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x avg-seg-len 9
-rwxr-xr-x cmp 23
-rwxr-xr-x cumul 32
-rwxr-xr-x de-bpe 4
-rwxr-xr-x de-sgm 2
-rw-r--r-- hist-tok 24
-rwxr-xr-x overlap 20
-rwxr-xr-x per-sentence-bleu 2
8 files changed, 114 insertions, 2 deletions
diff --git a/avg-seg-len b/avg-seg-len
new file mode 100755
index 0000000..ee68827
--- /dev/null
+++ b/avg-seg-len
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+
+# Mean number of whitespace-separated tokens per line of STDIN.
+lens = STDIN.each_line.map { |line| line.split.size }
+
+# Empty input has no segments: print 0.0 instead of crashing, since
+# inject(:+) returns nil for an empty array and nil has no `/`.
+puts lens.empty? ? 0.0 : lens.inject(:+) / lens.size.to_f
+
diff --git a/cmp b/cmp
new file mode 100755
index 0000000..ed8460c
--- /dev/null
+++ b/cmp
@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Usage: cmp REF ORG ADA -- the three files must have one segment per line.
+f = ReadFile.new ARGV[0] # printed as "ref)"
+g = ReadFile.new ARGV[1] # printed as "org"
+h = ReadFile.new ARGV[2] # printed as "ada"
+
+i = 0
+while line = f.gets
+  line1 = g.gets
+  line2 = h.gets
+  # A shorter system file used to crash with NoMethodError on nil.strip.
+  abort "cmp: missing system output for segment #{i}" unless line1 && line2
+  b1 = (BLEU::per_sentence_bleu(line1.strip, line.split, 4)*100).round 1
+  b2 = (BLEU::per_sentence_bleu(line2.strip, line.split, 4)*100).round 1
+  puts i
+  puts "ref) #{line}", "org #{b1}) #{line1}", "ada #{b2}) #{line2}"
+  puts " >>> #{(b2-b1).round 1} <<<", ""
+  i += 1
+end
+
diff --git a/cumul b/cumul
new file mode 100755
index 0000000..93a7e90
--- /dev/null
+++ b/cumul
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Usage: cumul REFS SYS1 SYS2 -- after each segment, prints the difference
+# BLEU(sys2) - BLEU(sys1), times 100, over all segments read so far.
+f = ReadFile.new ARGV[0]
+g = ReadFile.new ARGV[1]
+h = ReadFile.new ARGV[2]
+
+# BLEU::bleu is handed paths, so the prefix read so far must live on disk.
+# Open each scratch file once and append, rather than rewriting it in full
+# for every input line.
+refs = File.new("/tmp/refs", 'w')
+sys1 = File.new("/tmp/sys1", 'w')
+sys2 = File.new("/tmp/sys2", 'w')
+diffs = []
+while line = f.gets
+  # Flush so a reader opening the path sees the new segment. A nil line
+  # (shorter system file) is written as "", exactly as join("") treated it.
+  refs.write(line); refs.flush
+  sys1.write(g.gets); sys1.flush
+  sys2.write(h.gets); sys2.flush
+  #a = `~/multi-bleu.perl /tmp/refs < /tmp/sys1`.split[2].gsub(',','').to_f
+  a = BLEU::bleu("/tmp/sys1", "/tmp/refs", 4)
+  b = BLEU::bleu("/tmp/sys2", "/tmp/refs", 4)
+  diffs << b-a
+
+  #puts ((diffs.inject(:+)/diffs.size)*100).round 2
+  puts (diffs[-1]*100).round 2
+end
+
diff --git a/de-bpe b/de-bpe
new file mode 100755
index 0000000..0ba49af
--- /dev/null
+++ b/de-bpe
@@ -0,0 +1,4 @@
+#!/bin/bash
+# stdin -> stdout filter: deletes every "@@ " and any "@@" at the end of a line.
+sed -E "s/(@@ )|(@@$)//g"
+
diff --git a/de-sgm b/de-sgm
index fa28301..5c3a5d5 100755
--- a/de-sgm
+++ b/de-sgm
@@ -1,4 +1,4 @@
#!/bin/sh
-grep -v -P "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset)[^>]*>)[[:space:]]*$" | grep -v -P "^[[:space:]]*<(url|description|keywords|talkid|title)>.*</(url|description|keywords|talkid|title)>[[:space:]]*$" | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||"
+grep -v -E "^[[:space:]]*(<\?xml.*\?>|</?(mteval|doc|srcset|refset)[^>]*>)[[:space:]]*$" | grep -v -E "^[[:space:]]*<(url|description|keywords|talkid|title)>.*</(url|description|keywords|talkid|title)>[[:space:]]*$" | sed "s|<seg[^>]*>\s*||" | sed "s|\s*</seg>$||" # -E is required: the patterns use ERE grouping/alternation, which plain (BRE) grep reads as literal text
diff --git a/hist-tok b/hist-tok
new file mode 100644
index 0000000..b81604f
--- /dev/null
+++ b/hist-tok
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+# Token histogram over STDIN.
+#
+# Each line is split on whitespace; every distinct token is then printed as
+# "token<TAB>count", highest count first.
+
+counts = Hash.new(0)
+
+STDIN.each_line do |line|
+  line.split.each do |tok|
+    counts[tok] += 1
+  end
+end
+
+# Hash#to_a lists [token, count] pairs in the order each_pair walks them;
+# negating the count turns the ascending sort into a descending one.
+by_count = counts.to_a
+by_count.sort_by! { |_tok, n| -n }
+
+by_count.each do |tok, n|
+  puts "#{tok}\t#{n}"
+end
+
diff --git a/overlap b/overlap
new file mode 100755
index 0000000..81f9c4b
--- /dev/null
+++ b/overlap
@@ -0,0 +1,20 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# For every distinct entry read from ARGV[0], print how many times it occurs
+# among the entries of ARGV[1]; then print "---" and the sum of those counts.
+a = Hash.new(0)
+ReadFile.readlines_strip(ARGV[0]).each { |segment| a[segment] += 1 }
+b = Hash.new(0)
+ReadFile.readlines_strip(ARGV[1]).each { |segment| b[segment] += 1 }
+
+overlap = 0
+a.each_key do |seg|
+  hits = b[seg]
+  puts hits
+  overlap += hits
+end
+
+puts "---", overlap
+
diff --git a/per-sentence-bleu b/per-sentence-bleu
index 402f364..e70db2b 100755
--- a/per-sentence-bleu
+++ b/per-sentence-bleu
@@ -20,7 +20,7 @@ def main
puts 0.0
next
end
- puts BLEU::per_sentence_bleu line.strip, refs[i], conf[:n], conf[:len_hack]
+ puts BLEU::per_sentence_bleu line.strip, refs[i].split, conf[:n], conf[:len_hack]
end
input.close
end