From ad309390887b947d997e4040dac98126ee9a356c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 11 Dec 2015 16:09:13 +0100 Subject: phrase2 extraction: cosmetics --- phrase2_extraction/example.txt | 3 + phrase2_extraction/phrase2_extraction.rb | 718 ++++++++++++++++++++++++++++++ phrase_alignment/example.txt | 3 - phrase_alignment/phrase_alignment.rb | 735 ------------------------------- server.rb | 2 +- 5 files changed, 722 insertions(+), 739 deletions(-) create mode 100644 phrase2_extraction/example.txt create mode 100755 phrase2_extraction/phrase2_extraction.rb delete mode 100644 phrase_alignment/example.txt delete mode 100755 phrase_alignment/phrase_alignment.rb diff --git a/phrase2_extraction/example.txt b/phrase2_extraction/example.txt new file mode 100644 index 0000000..89bca35 --- /dev/null +++ b/phrase2_extraction/example.txt @@ -0,0 +1,3 @@ +a b c d +w x y z +0-1 1-0 2-2 3-3 diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb new file mode 100755 index 0000000..be17940 --- /dev/null +++ b/phrase2_extraction/phrase2_extraction.rb @@ -0,0 +1,718 @@ +#!/usr/bin/env ruby + +require 'zipf' + +module PhrasePhraseExtraction + +DEBUG = false +MAX_NT = 2 # Chiang: 2 +MAX_SEED_NUM_WORDS = 3 # Chiang: 10 words +MAX_SRC_SZ = 3 # Chiang: 5 words +FORBID_SRC_ADJACENT_SRC_NT = true # Chiang:true + +class Rule + attr_accessor :source, :target, :arity, :source_context, :target_context, :alignment + + def initialize source_range=nil, target_range=nil, source_context=nil, target_context=nil, alignment=[] + if source_context && target_range && source_context && target_context + @source = [source_range] + @target = [target_range] + @source_context = source_context + @target_context = target_context + @alignment = alignment + else + @source = [] + @target = [] + @source_context = [] + @target_context = [] + @alignment = [] + end + @arity = 0 + end + + def hash + self.as_trule_string.hash + end + + def eql? other + self.as_trule_string == other.as_trule_string + end + + def len_src + src_len = 0 + @source.each { |i| + if i.is_a? String + src_len += 1 + else + src_len += i.last-i.first+1 + end + } + + return src_len + end + + def len_tgt + tgt_len = 0 + @target.each { |i| + if i.is_a? String + tgt_len += 1 + else + tgt_len += i.last-i.first+1 + end + } + + return tgt_len + end + + def to_s + source_string = "" + @source.each { |i| + if i.is_a? Range + source_string += @source_context[i].to_s + else + source_string += " #{i} " + end + } + target_string = "" + @target.each { |i| + if i.is_a? Range + target_string += @target_context[i].to_s + else + target_string += " #{i} " + end + } + + astr = "" + @alignment.each { |p| + astr += " #{p.first}-#{p.last}" + } + astr.strip! + + return "#{source_string.gsub(/\s+/, " ").strip} -> #{target_string.gsub(/\s+/, " ").strip} | #{astr}" + end + + def rebase_alignment + min_src = @alignment.map{|p| p.first }.min + min_tgt = @alignment.map{|p| p.last }.min + @alignment.each_with_index { |p,j| + @alignment[j] = [p.first-min_src, p.last-min_tgt] + } + end + + def rebase_alignment1 correct_src, correct_tgt, start_source, start_target + @alignment.each_with_index { |p,j| + if p[0] > start_source + @alignment[j][0] = [0,p.first-correct_src].max + end + if p[1] > start_target + @alignment[j][1] = [0,p.last-correct_tgt].max + end + } + end + + def as_trule_string + source_string = "" + @source.each { |i| + if i.is_a? 
Range + source_string += @source_context[i].join(" ").strip + else + source_string += " #{i} " + end + } + target_string = "" + @target.each { |i| + if i.is_a? Range + target_string += @target_context[i].join(" ").strip + else + target_string += " #{i} " + end + } + source_string = source_string.lstrip.strip + target_string = target_string.lstrip.strip + + astr = "" + @alignment.each { |p| + astr += " #{p.first}-#{p.last}" + } + astr.strip! + + #source_string.gsub!(/\[X,\d+\]/, "[X]") + return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" + end + + def is_terminal? + #return false if @source.size>1 + #return false if @target.size>1 + @source.each { |i| return false if !i.is_a? Range } + @target.each { |i| return false if !i.is_a? Range } + return true + end + + # check if other_rule is a part of self + def mergeable_with? other_rule + return false if !other_rule.is_terminal? + other_source_begin = other_rule.source.first.first + other_source_end = other_rule.source.first.last + other_target_begin = other_rule.target.first.first + other_target_end = other_rule.target.first.last + b = false + @source.each { |i| + next if !i.is_a? Range + if ( other_source_begin >= i.first \ + && other_source_end <= i.last \ + && (!(other_source_begin==i.first && other_source_end==i.last))) + b = true + break + end + } + return false if !b + @target.each { |i| + next if !i.is_a? Range + if ( other_target_begin >= i.first \ + && other_target_end <= i.last \ + && (!(other_target_begin==i.first && other_target_end==i.last))) + b = true + break + end + } + + return b + end + + def self.split a, b, index=0, p="target" + return "[NEWX,#{index}]"if (a==b) + + aa = a.to_a + begin_split = b.first + end_split = b.last + + p1 = aa[0..aa.index([begin_split-1,aa.first].max)] + p2 = aa[aa.index([end_split+1, aa.last].min)..aa.last] + + nt = "[NEWX,#{index}]" + + ret = nil + if begin_split > a.first && end_split < a.last + ret = [(p1.first..p1.last), nt, (p2.first..p2.last)] + elsif begin_split == a.first + ret = [nt, (p2.first..p2.last)] + elsif end_split == a.last + ret = [(p1.first..p1.last), nt] + end + + return ret + end + + def self.merge r, s + return nil if !r.mergeable_with? s + return nil if !s.is_terminal? + + other_source_begin = s.source.first.first + other_source_end = s.source.first.last + other_target_begin = s.target.first.first + other_target_end = s.target.first.last + + new_rule = Rule.new + new_rule.source_context = r.source_context + new_rule.target_context = r.target_context + new_rule.arity = r.arity+1 + new_rule.alignment = Array.new + r.alignment.each { |p| new_rule.alignment << Array.new(p) } # deep copy + + c = new_rule.arity + done = false + correct_src = 0 + r.source.each_with_index { |i,j| + if i.is_a? Range + if ( !done \ + && other_source_begin >= i.first \ + && other_source_end <= i.last) + new_rule.source << Rule.split(i, (other_source_begin..other_source_end), c, "source") + new_rule.source.flatten! + done = true + else + new_rule.source << i + end + else + new_rule.source << i + end + } + # relabel Xs (linear on source side) + switch = false + k = 1 + new_rule.source.each_with_index { |i,j| + if i.is_a? String + m = i.match(/\[(X|NEWX),(\d+)\]/) + n = m[1] + l = m[2].to_i + if k != l + switch = true + end + new_rule.source[j] = "[#{n},#{k}]" + k += 1 + end + } + STDERR.write "switch #{switch}\n" if DEBUG + done = false + correct_tgt = 0 + r.target.each_with_index { |i,j| + if i.is_a? 
Range + if ( !done \ + && other_target_begin >= i.first \ + && other_target_end <= i.last) + new_rule.target << Rule.split(i, (other_target_begin..other_target_end), c) + new_rule.target.flatten! + done = true + else + new_rule.target << i + end + else + new_rule.target << i + reorder = true + end + } + + correct_src = r.len_src-new_rule.len_src + correct_tgt = r.len_tgt-new_rule.len_tgt + STDERR.write "correct_src #{correct_src}\n" + STDERR.write "correct_tgt #{correct_tgt}\n" + + start_correct_source = nil + j = 0 + fl = [] + new_rule.source.each { |i| + if i.is_a? Range + fl << new_rule.source_context[i] + else + if i.match(/\[NEWX,\d+\]/) + STDERR.write "j = #{j}\n" + start_correct_source = j + end + fl << i + end + j += 1 + } + fl.flatten! + + start_correct_target = nil + j = 0 + fl.each { |i| + if i.match(/\[NEWX,\d+\]/) + STDERR.write "j = #{j}\n" + start_correct_source = j + break + end + j += 1 + } + + el = [] + new_rule.target.each { |i| + if i.is_a? Range + el << new_rule.target_context[i] + else + el << i + end + j += 1 + } + el.flatten! + + start_correct_target = nil + j = 0 + el.each { |i| + if i.match(/\[NEWX,\d+\]/) + STDERR.write "j = #{j}\n" + start_correct_target = j + break + end + j += 1 + } + + if DEBUG + STDERR.write "start_correct_source = #{start_correct_source}\n" + STDERR.write "start_correct_target = #{start_correct_target}\n" + end + + new_rule.rebase_alignment1 correct_src, correct_tgt, start_correct_source, start_correct_target + STDERR.write "not uniq'ed #{new_rule.alignment.to_s}\n" if DEBUG + new_rule.alignment.uniq! + + if DEBUG + STDERR.write "a before: #{new_rule.alignment.to_s}\n" + STDERR.write "#{fl.to_s}\n" + end + new_rule.alignment.reject! { |p| + !fl[p.first] || !el[p.last] || fl[p.first].match(/\[(NEWX|X),\d+\]/) || el[p.last].match(/\[(NEWX|X),\d+\]/) + } + if DEBUG + STDERR.write "a after: #{new_rule.alignment.to_s}\n" + STDERR.write "old len_src #{r.len_src}\n" + STDERR.write "new len_src #{new_rule.len_src}\n" + STDERR.write "old len_tgt #{r.len_tgt}\n" + STDERR.write "new len_tgt #{new_rule.len_tgt}\n" + end + + if switch + new_rule.target.each_with_index { |i,j| + if i.is_a? String + m = i.match(/\[(X|NEWX),(\d+)\]/) + n = m[1] + k = m[2].to_i + l = nil + if k == 1 + l = 2 + else # 2 + l = 1 + end + new_rule.target[j] = "[#{n},#{l}]" + end + } + end + + new_rule.source.each_with_index { |i,j| + if i.is_a?(String) && i.match(/\[NEWX,\d\]/) + i.gsub!(/NEWX/, "X") + end + } + new_rule.target.each_with_index { |i,j| + if i.is_a?(String) && i.match(/\[NEWX,\d\]/) + i.gsub!(/NEWX/, "X") + end + } + + return new_rule + end + + def expand_fake_alignment + new_alignment = [] + if DEBUG + STDERR.write "#{@alignment.to_s}\n" + STDERR.write "#{@source.to_s}\n" + STDERR.write "#{@target.to_s}\n" + end + fl = @source.map { |i| + if i.is_a? Range + @source_context[i].map{|x|x.split} + else + i + end + }.flatten 1 + el = @target.map { |i| + if i.is_a? Range + @target_context[i].map{|x|x.split} + else + i + end + }.flatten 1 + if DEBUG + STDERR.write "#{fl.to_s}\n" + STDERR.write "#{el.to_s}\n" + STDERR.write "->\n" + end + + offsets_src = {} + #offsets_src.default = 0 + o = 0 + fl.each_with_index { |i,j| + if i.is_a? Array + o += i.size-1 + end + offsets_src[j] = o + } + offsets_tgt = {} + o = 0 + el.each_with_index { |i,j| + if i.is_a? 
Array + o += i.size-1 + end + offsets_tgt[j] = o + } + + @alignment.each { |p| + if DEBUG + STDERR.write "#{p.to_s}\n" + STDERR.write "#{offsets_src[p.first]} -- #{offsets_tgt[p.last]}\n" + end + new_alignment << [ p.first+offsets_src[p.first], p.last+offsets_tgt[p.last] ] + if DEBUG + STDERR.write "#{new_alignment.last.to_s}\n" + STDERR.write "---\n" + STDERR.write "\n" + end + } + @alignment = new_alignment + end + +end + +def PhrasePhraseExtraction.has_alignment a, i, dir="src" + index = 0 + index = 1 if dir=="tgt" + a.each { |p| + return true if p[index]==i + } + return false +end + +def PhrasePhraseExtraction.extract fstart, fend, estart, eend, f, e, a, flen, elen + a.each { |p| + fi=p[0]; ei=p[1] + if (fstart..fend).include? fi + if ei<estart || ei>eend + return [] + end + end + if (estart..eend).include? ei + if fi<fstart || fi>fend + return [] + end + end + + } + rules = [] + fs = fstart + loop do + fe = fend + loop do + rules << Rule.new(fs..fe, estart..eend, f, e) + a.each { |p| + if (fs..fe).include?(p.first) + rules.last.alignment << p + end + } + rules.last.rebase_alignment + fe += 1 + break if has_alignment(a, fe, "tgt")||fe>=elen + end + fs -= 1 + break has_alignment(a, fs, "src")||fs<0 + end + + return rules +end + +def PhrasePhraseExtraction.make_gappy_rules rules, seed_rules + MAX_NT.times { + new_rules = [] + rules.each { |r| + seed_rules.each { |s| + if r.mergeable_with? s + new = Rule.merge r, s + new_rules << new + STDERR.write "#{r.to_s} <<< #{s.to_s}\n" if DEBUG + STDERR.write " = #{new.to_s}\n\n" if DEBUG + end + } + } + rules += new_rules + } + + return rules +end + +def PhrasePhraseExtraction.make_seed_rules a, e, f + rules = [] + (0..e.size-1).each { |estart| + (estart..e.size-1).each { |eend| + + fstart = f.size-1 + fend = 0 + a.each { |p| + fi=p[0]; ei=p[1] + if estart<=ei && ei<=eend + fstart = [fi, fstart].min + fend = [fi, fend].max + end + } + next if fstart>fend + STDERR.write "fstart #{fstart}, fend #{fend}, estart #{estart}, eend #{eend}\n" if DEBUG + new_rules = extract fstart, fend, estart, eend, f, e, a, f.size, e.size + new_rules.each { |r| + STDERR.write "#{r.to_s}\n" if DEBUG + } + rules += new_rules + } + } + + return rules +end + +def PhrasePhraseExtraction.extract_rules f, e, as, expand=false + a = [] + as.each { |p| + x,y = p.split "-" + x = x.to_i; y = y.to_i + a << [x,y] + } + rules = PhrasePhraseExtraction.make_seed_rules a, e,f + seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules + rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules + + if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT + rules = PhrasePhraseExtraction.remove_adjacent_nt rules + end + + rules = PhrasePhraseExtraction.remove_too_long_src_sides rules + + if expand + rules.each { |r| r.expand_fake_alignment } + end + + return rules.uniq +end + +def PhrasePhraseExtraction.remove_too_large_seed_phrases rules + return rules.reject { |r| + STDERR.write "#{r}\n" + src_len = r.len_src + tgt_len = r.len_tgt + src_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS \ + || tgt_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS } +end + +def PhrasePhraseExtraction.remove_adjacent_nt rules + return rules.reject { |r| + b = false + prev = false + r.source.each { |i| + if i.is_a?
String + if prev + b = true + break + end + prev = true + else + prev = false + end + } + b + } +end + +def PhrasePhraseExtraction.remove_too_long_src_sides rules + return rules.reject { |r| + r.len_src > PhrasePhraseExtraction::MAX_SRC_SZ + } +end + +def PhrasePhraseExtraction.test + # 0 1 2 3 + # a b c d + # w x y z + # 0-0 + # 1-3 + # 2-2 + # 3-1 + ra = Rule.new + rb = Rule.new + ra.source = [(0..2), "[X,1]"] + ra.target = [(0..0), "[X,1]", (2..3)] + ra.source_context = ["a", "b", "c", "d"] + ra.target_context = ["w", "x", "y", "z"] + ra.alignment = [[0,0],[1,3],[2,2]] + ra.arity = 1 + rb.source = [(1..1)] + rb.target = [(3..3)] + rb.source_context = ["a", "b", "c", "d"] + rb.target_context = ["w", "x", "y", "z"] + rb.alignment = [[0,0]] + rb.arity = 0 + + puts ra.mergeable_with? rb + nr = Rule.merge ra, rb + puts ra.to_s + puts rb.to_s + puts nr.to_s +end + +def PhrasePhraseExtraction.test_phrase + ra = Rule.new + rb = Rule.new + ra.source = [(0..2), "[X,1]"] + ra.target = [(0..0), "[X,1]", (2..3)] + ra.source_context = ["a a", "b b", "c c", "d d"] + ra.target_context = ["w w", "x x", "y y", "z z"] + ra.alignment = [[0,0],[1,3],[2,2]] + #ra.expand_fake_alignment + ra.arity = 1 + rb.source = [(1..1)] + rb.target = [(3..3)] + rb.source_context = ra.source_context + rb.target_context = rb.source_context + rb.alignment = [[0,0]] + #rb.expand_fake_alignment + rb.arity = 0 + + puts ra.mergeable_with? rb + nr = Rule.merge ra, rb + puts ra.to_s + puts rb.to_s + nr.expand_fake_alignment + puts nr.to_s +end + +def PhrasePhraseExtraction.test_phrase1 + source_context = ["a", "b", "c", "Blechbänder", ", besteht", "der Spreizdorn im wesentlichen", "aus", "x"] + target_context = ["w", "x", "y", "the expansion", "mandrel consists", "essentially of expansion mandrel", "z"] + + ra = Rule.new + ra.source = ["[X,1]", (3..6)] + ra.target = ["[X,1]", (3..5)] + ra.source_context = source_context + ra.target_context = target_context + ra.alignment = [[1,1],[2,2],[3,3],[4,2]] + ra.arity = 1 + + rb = Rule.new + rb.source = [(4..6)] + rb.target = [(4..5)] + rb.source_context = source_context + rb.target_context = target_context + rb.alignment = [[0,0],[1,1],[2,0]] + rb.arity = 0 + + puts ra.mergeable_with? rb + nr = Rule.merge ra, rb + puts ra.to_s + puts rb.to_s + nr.expand_fake_alignment + puts nr.to_s +end + +end # module + +def main + file = ReadFile.new ARGV[0] + + f = file.gets.split + e = file.gets.split + a = [] + file.gets.split.each { |p| + x,y = p.split "-" + x = x.to_i; y = y.to_i + a << [x,y] + } + rules = PhrasePhraseExtraction.make_seed_rules a, e, f + seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules + rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules + + if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT + rules = PhrasePhraseExtraction.remove_adjacent_nt rules + end + + rules = PhrasePhraseExtraction.remove_too_long_src_sides rules + + rules.uniq! 
+ + rules.each { |r| + puts r.as_trule_string + } +end +main + +def test + PhrasePhraseExtraction.test + PhrasePhraseExtraction.test_phrase + PhrasePhraseExtraction.test_phrase1 +end +#test + diff --git a/phrase_alignment/example.txt b/phrase_alignment/example.txt deleted file mode 100644 index 89bca35..0000000 --- a/phrase_alignment/example.txt +++ /dev/null @@ -1,3 +0,0 @@ -a b c d -w x y z -0-1 1-0 2-2 3-3 diff --git a/phrase_alignment/phrase_alignment.rb b/phrase_alignment/phrase_alignment.rb deleted file mode 100755 index 1c1a0ed..0000000 --- a/phrase_alignment/phrase_alignment.rb +++ /dev/null @@ -1,735 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - - -module PhrasePhraseExtraction - -DEBUG=true -MAX_NT=2 # chiang:2 -MAX_SEED_NUM_WORDS=3 # chiang:10 words phrases! -MAX_SRC_SZ=3 # chiang:5 words phrases! -FORBID_SRC_ADJACENT_SRC_NT=true # chiang:true - -class Rule - attr_accessor :source, :target, :arity, :source_context, :target_context, :alignment - - def initialize source_range=nil, target_range=nil, source_context=nil, target_context=nil, alignment=[] - if source_context && target_range && source_context && target_context - @source = [source_range] - @target = [target_range] - @source_context = source_context - @target_context = target_context - @alignment = alignment - else - @source = [] - @target = [] - @source_context = [] - @target_context = [] - @alignment = [] - end - @arity = 0 - end - - def <=> other_rule - end - - def hash - self.as_trule_string.hash - end - - def eql? other - self.as_trule_string == other.as_trule_string - end - - def len_src - src_len = 0 - @source.each { |i| - if i.is_a? String - src_len += 1 - else - src_len += i.last-i.first+1 - end - } - - return src_len - end - - def len_tgt - tgt_len = 0 - @target.each { |i| - if i.is_a? String - tgt_len += 1 - else - tgt_len += i.last-i.first+1 - end - } - - return tgt_len - end - - def len - src_len = 0 - @source.each { |i| - if i.is_a? String - src_len += 1 - else - src_len += i.last-i.first+1 - end - } - tgt_len = 0 - @target.each { |i| - if i.is_a? String - tgt_len += 1 - else - tgt_len += i.last-i.first+1 - end - } - return [src_len, tgt_len] - end - - def to_s - source_string = "" - @source.each { |i| - if i.is_a? Range - source_string += @source_context[i].to_s - else - source_string += " #{i} " - end - } - target_string = "" - @target.each { |i| - if i.is_a? Range - target_string += @target_context[i].to_s - else - target_string += " #{i} " - end - } - - astr = "" - @alignment.each { |p| - astr += " #{p.first}-#{p.last}" - } - astr.strip! - - return "#{source_string.gsub(/\s+/, " ").strip} -> #{target_string.gsub(/\s+/, " ").strip} | #{astr}" - end - - def base_alignment - min_src = @alignment.map{|p| p.first }.min - min_tgt = @alignment.map{|p| p.last }.min - @alignment.each_with_index { |p,j| - @alignment[j] = [p.first-min_src, p.last-min_tgt] - } - end - - def base_alignment2 correct_src, correct_tgt, start_source, start_target - @alignment.each_with_index { |p,j| - if p[0] > start_source - @alignment[j][0] = [0,p.first-correct_src].max - end - if p[1] > start_target - @alignment[j][1] = [0,p.last-correct_tgt].max - end - } - end - - def as_trule_string - source_string = "" - @source.each { |i| - if i.is_a? Range - source_string += @source_context[i].join(" ").strip - else - source_string += " #{i} " - end - } - target_string = "" - @target.each { |i| - if i.is_a? 
Range - target_string += @target_context[i].join(" ").strip - else - target_string += " #{i} " - end - } - source_string = source_string.lstrip.strip - target_string = target_string.lstrip.strip - - astr = "" - @alignment.each { |p| - astr += " #{p.first}-#{p.last}" - } - astr.strip! - - #source_string.gsub!(/\[X,\d+\]/, "[X]") - return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" - end - - def is_terminal? - #return false if @source.size>1 - #return false if @target.size>1 - @source.each { |i| return false if !i.is_a? Range } - @target.each { |i| return false if !i.is_a? Range } - return true - end - - # check if other_rule is a part of self - def mergeable_with? other_rule - return false if !other_rule.is_terminal? - other_source_begin = other_rule.source.first.first - other_source_end = other_rule.source.first.last - other_target_begin = other_rule.target.first.first - other_target_end = other_rule.target.first.last - b = false - @source.each { |i| - next if !i.is_a? Range - if ( other_source_begin >= i.first \ - && other_source_end <= i.last \ - && (!(other_source_begin==i.first && other_source_end==i.last))) - b = true - break - end - } - return false if !b - @target.each { |i| - next if !i.is_a? Range - if ( other_target_begin >= i.first \ - && other_target_end <= i.last \ - && (!(other_target_begin==i.first && other_target_end==i.last))) - b = true - break - end - } - - return b - end - - def self.split a, b, index=0, p="target" - return "[NEWX,#{index}]"if (a==b) - - aa = a.to_a - begin_split = b.first - end_split = b.last - - p1 = aa[0..aa.index([begin_split-1,aa.first].max)] - p2 = aa[aa.index([end_split+1, aa.last].min)..aa.last] - - nt = "[NEWX,#{index}]" - - ret = nil - if begin_split > a.first && end_split < a.last - ret = [(p1.first..p1.last), nt, (p2.first..p2.last)] - elsif begin_split == a.first - ret = [nt, (p2.first..p2.last)] - elsif end_split == a.last - ret = [(p1.first..p1.last), nt] - end - - return ret - end - - def self.merge r, s - return nil if !r.mergeable_with? s - return nil if !s.is_terminal? - - other_source_begin = s.source.first.first - other_source_end = s.source.first.last - other_target_begin = s.target.first.first - other_target_end = s.target.first.last - - new_rule = Rule.new - new_rule.source_context = r.source_context - new_rule.target_context = r.target_context - new_rule.arity = r.arity+1 - new_rule.alignment = Array.new - r.alignment.each { |p| new_rule.alignment << Array.new(p) } # deep copy - - c = new_rule.arity - done = false - correct_src = 0 - r.source.each_with_index { |i,j| - if i.is_a? Range - if ( !done \ - && other_source_begin >= i.first \ - && other_source_end <= i.last) - new_rule.source << Rule.split(i, (other_source_begin..other_source_end), c, "source") - new_rule.source.flatten! - done = true - else - new_rule.source << i - end - else - new_rule.source << i - end - } - # relabel Xs (linear) - switch = false - k = 1 - new_rule.source.each_with_index { |i,j| - if i.is_a? String - m = i.match(/\[(X|NEWX),(\d+)\]/) - n = m[1] - l = m[2].to_i - if k != l - switch = true - end - new_rule.source[j] = "[#{n},#{k}]" - k += 1 - end - } - puts "switch #{switch}" if DEBUG - done = false - correct_tgt = 0 - r.target.each_with_index { |i,j| - if i.is_a? Range - if ( !done \ - && other_target_begin >= i.first \ - && other_target_end <= i.last) - new_rule.target << Rule.split(i, (other_target_begin..other_target_end), c) - new_rule.target.flatten! 
- done = true - else - new_rule.target << i - end - else - new_rule.target << i - reorder = true - end - } - - correct_src = r.len_src-new_rule.len_src - correct_tgt = r.len_tgt-new_rule.len_tgt - puts "correct_src #{correct_src}" - puts "correct_tgt #{correct_tgt}" - - start_correct_source = nil - j = 0 - fl = [] - new_rule.source.each { |i| - if i.is_a? Range - fl << new_rule.source_context[i] - else - if i.match(/\[NEWX,\d+\]/) - puts "j = #{j}" - start_correct_source = j - end - fl << i - end - j += 1 - } - fl.flatten! - - start_correct_target = nil - j = 0 - fl.each { |i| - if i.match(/\[NEWX,\d+\]/) - puts "j = #{j}" - start_correct_source = j - break - end - j += 1 - } - - el = [] - new_rule.target.each { |i| - if i.is_a? Range - el << new_rule.target_context[i] - else - el << i - end - j += 1 - } - el.flatten! - - start_correct_target = nil - j = 0 - el.each { |i| - if i.match(/\[NEWX,\d+\]/) - puts "j = #{j}" - start_correct_target = j - break - end - j += 1 - } - - puts "start_correct_source = #{start_correct_source}" - puts "start_correct_target = #{start_correct_target}" - - new_rule.base_alignment2 correct_src, correct_tgt, start_correct_source, start_correct_target - puts "not uniq #{new_rule.alignment.to_s}" - new_rule.alignment.uniq! - - puts "a before: #{new_rule.alignment.to_s}" - puts fl.to_s - new_rule.alignment.reject! { |p| - !fl[p.first] || !el[p.last] || fl[p.first].match(/\[(NEWX|X),\d+\]/) || el[p.last].match(/\[(NEWX|X),\d+\]/) - } - puts "a after: #{new_rule.alignment.to_s}" - puts "old len_src #{r.len_src}" - puts "new len_src #{new_rule.len_src}" - puts "old len_tgt #{r.len_tgt}" - puts "new len_tgt #{new_rule.len_tgt}" - - if switch - new_rule.target.each_with_index { |i,j| - if i.is_a? String - m = i.match(/\[(X|NEWX),(\d+)\]/) - n = m[1] - k = m[2].to_i - l = nil - if k == 1 - l = 2 - else # 2 - l = 1 - end - new_rule.target[j] = "[#{n},#{l}]" - end - } - end - - new_rule.source.each_with_index { |i,j| - if i.is_a?(String) && i.match(/\[NEWX,\d\]/) - i.gsub!(/NEWX/, "X") - end - } - new_rule.target.each_with_index { |i,j| - if i.is_a?(String) && i.match(/\[NEWX,\d\]/) - i.gsub!(/NEWX/, "X") - end - } - - return new_rule - end - - def expand_fake_alignment - new_alignment = [] - if DEBUG - puts @alignment.to_s - puts @source.to_s - puts @target.to_s - end - fl = @source.map { |i| - if i.is_a? Range - @source_context[i].map{|x|x.split} - else - i - end - }.flatten 1 - el = @target.map { |i| - if i.is_a? Range - @target_context[i].map{|x|x.split} - else - i - end - }.flatten 1 - if DEBUG - puts fl.to_s - puts el.to_s - puts "->" - end - - offsets_src = {} - #offsets_src.default = 0 - o = 0 - fl.each_with_index { |i,j| - if i.is_a? Array - o += i.size-1 - end - offsets_src[j] = o - } - offsets_tgt = {} - #offsets_tgt.default = 0 - o = 0 - el.each_with_index { |i,j| - if i.is_a? Array - o += i.size-1 - end - offsets_tgt[j] = o - } - - @alignment.each { |p| - if DEBUG - puts p.to_s - puts "#{offsets_src[p.first]} -- #{offsets_tgt[p.last]}" - end - new_alignment << [ p.first+offsets_src[p.first], p.last+offsets_tgt[p.last] ] - if DEBUG - puts new_alignment.last.to_s - puts "---" - puts - end - } - @alignment = new_alignment - end - -end - -def PhrasePhraseExtraction.has_alignment a, i, dir="src" - index = 0 - index = 1 if dir=="tgt" - a.each { |p| - return true if p[index]==i - } - return false -end - -def PhrasePhraseExtraction.extract fstart, fend, estart, eend, f, e, a, flen, elen - a.each { |p| - fi=p[0]; ei=p[1] - if (fstart..fend).include? 
fi - if eieend - return [] - end - end - if (estart..eend).include? ei - if fifend - return [] - end - end - - } - rules = [] - fs = fstart - loop do - fe = fend - loop do - rules << Rule.new(fs..fe, estart..eend, f, e) - a.each { |p| - if (fs..fe).include?(p.first) - rules.last.alignment << p - end - } - rules.last.base_alignment - fe += 1 - break if has_alignment(a, fe, "tgt")||fe>=elen - end - fs -= 1 - break has_alignment(a, fs, "src")||fs<0 - end - - return rules -end - -def PhrasePhraseExtraction.make_gappy_rules rules, seed_rules - MAX_NT.times { - new_rules = [] - rules.each { |r| - seed_rules.each { |s| - if r.mergeable_with? s - new = Rule.merge r, s - new_rules << new - puts "#{r.to_s} <<< #{s.to_s}" if DEBUG - puts " = #{new.to_s}\n\n" if DEBUG - end - } - } - rules += new_rules - } - - return rules -end - -def PhrasePhraseExtraction.make_seed_rules a, e, f - rules = [] - (0..e.size-1).each { |estart| - (estart..e.size-1).each { |eend| - - fstart = f.size-1 - fend = 0 - a.each { |p| - fi=p[0]; ei=p[1] - if estart<=ei && ei<=eend - fstart = [fi, fstart].min - fend = [fi, fend].max - end - } - next if fstart>fend - puts "fstart #{fstart}, fend #{fend}, estart #{estart}, eend #{eend}" if DEBUG - new_rules = extract fstart, fend, estart, eend, f, e, a, f.size, e.size - new_rules.each { |r| - puts r.to_s if DEBUG - } - rules += new_rules - } - } - - return rules -end - -def PhrasePhraseExtraction.test - # 0 1 2 3 - # a b c d - # w x y z - # 0-0 - # 1-3 - # 2-2 - # 3-1 - ra = Rule.new - rb = Rule.new - ra.source = [(0..2), "[X,1]"] - ra.target = [(0..0), "[X,1]", (2..3)] - ra.source_context = ["a", "b", "c", "d"] - ra.target_context = ["w", "x", "y", "z"] - ra.alignment = [[0,0],[1,3],[2,2]] - ra.arity = 1 - rb.source = [(1..1)] - rb.target = [(3..3)] - rb.source_context = ["a", "b", "c", "d"] - rb.target_context = ["w", "x", "y", "z"] - rb.alignment = [[0,0]] - rb.arity = 0 - - puts ra.mergeable_with? rb - nr = Rule.merge ra, rb - puts ra.to_s - puts rb.to_s - puts nr.to_s -end - -def PhrasePhraseExtraction.test_phrase - ra = Rule.new - rb = Rule.new - ra.source = [(0..2), "[X,1]"] - ra.target = [(0..0), "[X,1]", (2..3)] - ra.source_context = ["a a", "b b", "c c", "d d"] - ra.target_context = ["w w", "x x", "y y", "z z"] - ra.alignment = [[0,0],[1,3],[2,2]] - #ra.expand_fake_alignment - ra.arity = 1 - rb.source = [(1..1)] - rb.target = [(3..3)] - rb.source_context = ra.source_context - rb.target_context = rb.source_context - rb.alignment = [[0,0]] - #rb.expand_fake_alignment - rb.arity = 0 - - puts ra.mergeable_with? rb - nr = Rule.merge ra, rb - puts ra.to_s - puts rb.to_s - nr.expand_fake_alignment - puts nr.to_s -end - -def PhrasePhraseExtraction.test_phrase2 - source_context = ["a", "b", "c", "Blechbänder", ", besteht", "der Spreizdorn im wesentlichen", "aus", "x"] - target_context = ["w", "x", "y", "the expansion", "mandrel consists", "essentially of expansion mandrel", "z"] - - ra = Rule.new - ra.source = ["[X,1]", (3..6)] - ra.target = ["[X,1]", (3..5)] - ra.source_context = source_context - ra.target_context = target_context - ra.alignment = [[1,1],[2,2],[3,3],[4,2]] - ra.arity = 1 - - rb = Rule.new - rb.source = [(4..6)] - rb.target = [(4..5)] - rb.source_context = source_context - rb.target_context = target_context - rb.alignment = [[0,0],[1,1],[2,0]] - rb.arity = 0 - - puts ra.mergeable_with? 
rb - nr = Rule.merge ra, rb - puts ra.to_s - puts rb.to_s - nr.expand_fake_alignment - puts nr.to_s -end - -def PhrasePhraseExtraction.extract_rules f, e, as, expand=false - a = [] - as.each { |p| - x,y = p.split "-" - x = x.to_i; y = y.to_i - a << [x,y] - } - rules = PhrasePhraseExtraction.make_seed_rules a, e,f - seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules - rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules - - if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT - rules = PhrasePhraseExtraction.remove_adj_nt rules - end - - rules = PhrasePhraseExtraction.remove_too_long_src_sides rules - - if expand - rules.each { |r| r.expand_fake_alignment } - end - - return rules.uniq -end - -def PhrasePhraseExtraction.remove_too_large_seed_phrases rules - return rules.reject { |r| - src_len, tgt_len = r.len - src_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS \ - || tgt_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS } -end - -def PhrasePhraseExtraction.remove_adj_nt rules - return rules.reject { |r| - b = false - prev = false - r.source.each { |i| - if i.is_a? String - if prev - b = true - break - end - prev = true - else - prev = false - end - } - b - } -end - -def PhrasePhraseExtraction.remove_too_long_src_sides rules - return rules.reject { |r| - r.len.first > PhrasePhraseExtraction::MAX_SRC_SZ - } -end - -end # module - -def main - file = ReadFile.new ARGV[0] - - f = file.gets.split - e = file.gets.split - a = [] - file.gets.split.each { |p| - x,y = p.split "-" - x = x.to_i; y = y.to_i - a << [x,y] - } - rules = PhrasePhraseExtraction.make_seed_rules a, e, f - seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules - rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules - - if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT - rules = PhrasePhraseExtraction.remove_adj_nt rules - end - - rules = PhrasePhraseExtraction.remove_too_long_src_sides rules - - rules.uniq! - - rules.each { |r| - puts r.as_trule_string - } -end -#main - -def test - #PhrasePhraseExtraction.test - #PhrasePhraseExtraction.test_phrase - PhrasePhraseExtraction.test_phrase2 -end -test - diff --git a/server.rb b/server.rb index 9ad49f3..f976ee6 100755 --- a/server.rb +++ b/server.rb @@ -9,7 +9,7 @@ require 'json' require 'haml' require 'uri' require_relative './derivation_to_json/derivation_to_json' -require_relative './phrase_alignment/phrase_alignment' +require_relative './phrase2_extraction/phrase2_extraction' # ############################################################################# # Load configuration file and setup global variables -- cgit v1.2.3
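
A quick usage sketch for the relocated extraction module, mirroring the bundled phrase2_extraction/example.txt. This is not part of the patch itself; it assumes the zipf gem is installed and that phrase2_extraction/phrase2_extraction.rb has already been loaded (e.g. via the require_relative call added to server.rb above):

    # Rebuild the example.txt input: source tokens, target tokens, and
    # word alignments given as "source-target" index pairs.
    f = %w[a b c d]
    e = %w[w x y z]
    a = %w[0-1 1-0 2-2 3-3]
    # extract_rules returns the unique Rule objects; as_trule_string prints
    # each in the "[X] ||| src ||| tgt ||| NewRule=1 ||| alignment" format.
    rules = PhrasePhraseExtraction.extract_rules f, e, a
    rules.each { |r| puts r.as_trule_string }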