From 958ce7102f5b0c9fac27d164394c8b13cf8b24f2 Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Fri, 4 Dec 2015 00:33:21 +0100 Subject: integrated phrase-phrase-extraction, save svg --- help.inc.php | 2 +- interface.php | 4 +- js/debug.js | 4 + js/lfpe.js | 1 + phrase_alignment/example.txt | 3 + phrase_alignment/phrase_alignment.rb | 735 +++++++++++++++++++++++++++++++++++ server.rb | 40 +- views/debug.haml | 2 + 8 files changed, 786 insertions(+), 5 deletions(-) create mode 100644 phrase_alignment/example.txt create mode 100755 phrase_alignment/phrase_alignment.rb diff --git a/help.inc.php b/help.inc.php index 5132620..e571a6d 100644 --- a/help.inc.php +++ b/help.inc.php @@ -1,4 +1,4 @@ -
Press the 'Next' button to submit your post-edit and to request the next segment for post-edition. +
Press the 'Next' button to submit your post-edit and to request the next segment for post-edition. Alternatively, in the textual interface, you may just press return when you finished the post-edit ('Target' text area is in focus).
The session can be paused at any time and continued later; However, if you have to pause your session, wait until the activity notification disappears and then press 'Pause', as we are collecting timing information. You may also just reload this site and re-request the segment to reset the timer.
diff --git a/interface.php b/interface.php index d8b7a72..dd08b10 100644 --- a/interface.php +++ b/interface.php @@ -91,8 +91,8 @@ foreach($db->raw_source_segments as $s) { - -Support: Mail
Session: # | Debug
diff --git a/js/debug.js b/js/debug.js index 76c7b80..633a5c4 100644 --- a/js/debug.js +++ b/js/debug.js @@ -7,5 +7,9 @@ $().ready(function() }}); }) }) + + var d = atob(document.getElementById("svg_b64").innerHTML); + $('#svg').append($('')); + }) diff --git a/js/lfpe.js b/js/lfpe.js index 334fd16..9e7b9af 100644 --- a/js/lfpe.js +++ b/js/lfpe.js @@ -320,6 +320,7 @@ function Next() $("#next").html("Next"); $("#oov_tgt0").focus(); not_working(false); + DE_locked = true; // translation mode } else { diff --git a/phrase_alignment/example.txt b/phrase_alignment/example.txt new file mode 100644 index 0000000..89bca35 --- /dev/null +++ b/phrase_alignment/example.txt @@ -0,0 +1,3 @@ +a b c d +w x y z +0-1 1-0 2-2 3-3 diff --git a/phrase_alignment/phrase_alignment.rb b/phrase_alignment/phrase_alignment.rb new file mode 100755 index 0000000..9d29798 --- /dev/null +++ b/phrase_alignment/phrase_alignment.rb @@ -0,0 +1,735 @@ +#!/usr/bin/env ruby + +require 'zipf' + + +module PhrasePhraseExtraction + +DEBUG=true +MAX_NT=2 # chiang:2 +MAX_SEED_NUM_WORDS=3 # chiang:10 words phrases! +MAX_SRC_SZ=3 # chiang:5 words phrases! +FORBID_SRC_ADJACENT_SRC_NT=true # chiang:true + +class Rule + attr_accessor :source, :target, :arity, :source_context, :target_context, :alignment + + def initialize source_range=nil, target_range=nil, source_context=nil, target_context=nil, alignment=[] + if source_context && target_range && source_context && target_context + @source = [source_range] + @target = [target_range] + @source_context = source_context + @target_context = target_context + @alignment = alignment + else + @source = [] + @target = [] + @source_context = [] + @target_context = [] + @alignment = [] + end + @arity = 0 + end + + def <=> other_rule + end + + def hash + self.as_trule_string.hash + end + + def eql? other + self.as_trule_string == other.as_trule_string + end + + def len_src + src_len = 0 + @source.each { |i| + if i.is_a? String + src_len += 1 + else + src_len += i.last-i.first+1 + end + } + + return src_len + end + + def len_tgt + tgt_len = 0 + @target.each { |i| + if i.is_a? String + tgt_len += 1 + else + tgt_len += i.last-i.first+1 + end + } + + return tgt_len + end + + def len + src_len = 0 + @source.each { |i| + if i.is_a? String + src_len += 1 + else + src_len += i.last-i.first+1 + end + } + tgt_len = 0 + @target.each { |i| + if i.is_a? String + tgt_len += 1 + else + tgt_len += i.last-i.first+1 + end + } + return [src_len, tgt_len] + end + + def to_s + source_string = "" + @source.each { |i| + if i.is_a? Range + source_string += @source_context[i].to_s + else + source_string += " #{i} " + end + } + target_string = "" + @target.each { |i| + if i.is_a? Range + target_string += @target_context[i].to_s + else + target_string += " #{i} " + end + } + + astr = "" + @alignment.each { |p| + astr += " #{p.first}-#{p.last}" + } + astr.strip! + + return "#{source_string.gsub(/\s+/, " ").strip} -> #{target_string.gsub(/\s+/, " ").strip} | #{astr}" + end + + def base_alignment + min_src = @alignment.map{|p| p.first }.min + min_tgt = @alignment.map{|p| p.last }.min + @alignment.each_with_index { |p,j| + @alignment[j] = [p.first-min_src, p.last-min_tgt] + } + end + + def base_alignment2 correct_src, correct_tgt, start_source, start_target + @alignment.each_with_index { |p,j| + if p[0] > start_source + @alignment[j][0] = [0,p.first-correct_src].max + end + if p[1] > start_target + @alignment[j][1] = [0,p.last-correct_tgt].max + end + } + end + + def as_trule_string + source_string = "" + @source.each { |i| + if i.is_a? Range + source_string += @source_context[i].join(" ").strip + else + source_string += " #{i} " + end + } + target_string = "" + @target.each { |i| + if i.is_a? Range + target_string += @target_context[i].join(" ").strip + else + target_string += " #{i} " + end + } + source_string = source_string.lstrip.strip + target_string = target_string.lstrip.strip + + astr = "" + @alignment.each { |p| + astr += " #{p.first}-#{p.last}" + } + astr.strip! + + #source_string.gsub!(/\[X,\d+\]/, "[X]") + return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" + end + + def is_terminal? + #return false if @source.size>1 + #return false if @target.size>1 + @source.each { |i| return false if !i.is_a? Range } + @target.each { |i| return false if !i.is_a? Range } + return true + end + + # check if other_rule is a part of self + def mergeable_with? other_rule + return false if !other_rule.is_terminal? + other_source_begin = other_rule.source.first.first + other_source_end = other_rule.source.first.last + other_target_begin = other_rule.target.first.first + other_target_end = other_rule.target.first.last + b = false + @source.each { |i| + next if !i.is_a? Range + if ( other_source_begin >= i.first \ + && other_source_end <= i.last \ + && (!(other_source_begin==i.first && other_source_end==i.last))) + b = true + break + end + } + return false if !b + @target.each { |i| + next if !i.is_a? Range + if ( other_target_begin >= i.first \ + && other_target_end <= i.last \ + && (!(other_target_begin==i.first && other_target_end==i.last))) + b = true + break + end + } + + return b + end + + def self.split a, b, index=0, p="target" + return "[NEWX,#{index}]"if (a==b) + + aa = a.to_a + begin_split = b.first + end_split = b.last + + p1 = aa[0..aa.index([begin_split-1,aa.first].max)] + p2 = aa[aa.index([end_split+1, aa.last].min)..aa.last] + + nt = "[NEWX,#{index}]" + + ret = nil + if begin_split > a.first && end_split < a.last + ret = [(p1.first..p1.last), nt, (p2.first..p2.last)] + elsif begin_split == a.first + ret = [nt, (p2.first..p2.last)] + elsif end_split == a.last + ret = [(p1.first..p1.last), nt] + end + + return ret + end + + def self.merge r, s + return nil if !r.mergeable_with? s + return nil if !s.is_terminal? + + other_source_begin = s.source.first.first + other_source_end = s.source.first.last + other_target_begin = s.target.first.first + other_target_end = s.target.first.last + + new_rule = Rule.new + new_rule.source_context = r.source_context + new_rule.target_context = r.target_context + new_rule.arity = r.arity+1 + new_rule.alignment = Array.new + r.alignment.each { |p| new_rule.alignment << Array.new(p) } # deep copy + + c = new_rule.arity + done = false + correct_src = 0 + r.source.each_with_index { |i,j| + if i.is_a? Range + if ( !done \ + && other_source_begin >= i.first \ + && other_source_end <= i.last) + new_rule.source << Rule.split(i, (other_source_begin..other_source_end), c, "source") + new_rule.source.flatten! + done = true + else + new_rule.source << i + end + else + new_rule.source << i + end + } + # relabel Xs (linear) + switch = false + k = 1 + new_rule.source.each_with_index { |i,j| + if i.is_a? String + m = i.match(/\[(X|NEWX),(\d+)\]/) + n = m[1] + l = m[2].to_i + if k != l + switch = true + end + new_rule.source[j] = "[#{n},#{k}]" + k += 1 + end + } + puts "switch #{switch}" if DEBUG + done = false + correct_tgt = 0 + r.target.each_with_index { |i,j| + if i.is_a? Range + if ( !done \ + && other_target_begin >= i.first \ + && other_target_end <= i.last) + new_rule.target << Rule.split(i, (other_target_begin..other_target_end), c) + new_rule.target.flatten! + done = true + else + new_rule.target << i + end + else + new_rule.target << i + reorder = true + end + } + + correct_src = r.len_src-new_rule.len_src + correct_tgt = r.len_tgt-new_rule.len_tgt + puts "correct_src #{correct_src}" + puts "correct_tgt #{correct_tgt}" + + start_correct_source = nil + j = 0 + fl = [] + new_rule.source.each { |i| + if i.is_a? Range + fl << new_rule.source_context[i] + else + if i.match(/\[NEWX,\d+\]/) + puts "j = #{j}" + start_correct_source = j + end + fl << i + end + j += 1 + } + fl.flatten! + + start_correct_target = nil + j = 0 + fl.each { |i| + if i.match(/\[NEWX,\d+\]/) + puts "j = #{j}" + start_correct_source = j + break + end + j += 1 + } + + el = [] + new_rule.target.each { |i| + if i.is_a? Range + el << new_rule.target_context[i] + else + el << i + end + j += 1 + } + el.flatten! + + start_correct_target = nil + j = 0 + el.each { |i| + if i.match(/\[NEWX,\d+\]/) + puts "j = #{j}" + start_correct_target = j + break + end + j += 1 + } + + puts "start_correct_source = #{start_correct_source}" + puts "start_correct_target = #{start_correct_target}" + + new_rule.base_alignment2 correct_src, correct_tgt, start_correct_source, start_correct_target + puts "not uniq #{new_rule.alignment.to_s}" + new_rule.alignment.uniq! + + puts "a before: #{new_rule.alignment.to_s}" + puts fl.to_s + new_rule.alignment.reject! { |p| + !fl[p.first] || !el[p.last] || fl[p.first].match(/\[(NEWX|X),\d+\]/) || el[p.last].match(/\[(NEWX|X),\d+\]/) + } + puts "a after: #{new_rule.alignment.to_s}" + puts "old len_src #{r.len_src}" + puts "new len_src #{new_rule.len_src}" + puts "old len_tgt #{r.len_tgt}" + puts "new len_tgt #{new_rule.len_tgt}" + + if switch + new_rule.target.each_with_index { |i,j| + if i.is_a? String + m = i.match(/\[(X|NEWX),(\d+)\]/) + n = m[1] + k = m[2].to_i + l = nil + if k == 1 + l = 2 + else # 2 + l = 1 + end + new_rule.target[j] = "[#{n},#{l}]" + end + } + end + + new_rule.source.each_with_index { |i,j| + if i.is_a?(String) && i.match(/\[NEWX,\d\]/) + i.gsub!(/NEWX/, "X") + end + } + new_rule.target.each_with_index { |i,j| + if i.is_a?(String) && i.match(/\[NEWX,\d\]/) + i.gsub!(/NEWX/, "X") + end + } + + return new_rule + end + + def expand_fake_alignment + new_alignment = [] + if DEBUG + puts @alignment.to_s + puts @source.to_s + puts @target.to_s + end + fl = @source.map { |i| + if i.is_a? Range + @source_context[i].map{|x|x.split} + else + i + end + }.flatten 1 + el = @target.map { |i| + if i.is_a? Range + @target_context[i].map{|x|x.split} + else + i + end + }.flatten 1 + if DEBUG + puts fl.to_s + puts el.to_s + puts "->" + end + + offsets_src = {} + #offsets_src.default = 0 + o = 0 + fl.each_with_index { |i,j| + if i.is_a? Array + o += i.size-1 + end + offsets_src[j] = o + } + offsets_tgt = {} + #offsets_tgt.default = 0 + o = 0 + el.each_with_index { |i,j| + if i.is_a? Array + o += i.size-1 + end + offsets_tgt[j] = o + } + + @alignment.each { |p| + if DEBUG + puts p.to_s + puts "#{offsets_src[p.first]} -- #{offsets_tgt[p.last]}" + end + new_alignment << [ p.first+offsets_src[p.first], p.last+offsets_tgt[p.last] ] + if DEBUG + puts new_alignment.last.to_s + puts "---" + puts + end + } + @alignment = new_alignment + end + +end + +def PhrasePhraseExtraction.has_alignment a, i, dir="src" + index = 0 + index = 1 if dir=="tgt" + a.each { |p| + return true if p[index]==i + } + return false +end + +def PhrasePhraseExtraction.extract fstart, fend, estart, eend, f, e, a, flen, elen + a.each { |p| + fi=p[0]; ei=p[1] + if (fstart..fend).include? fi + if ei