#!/usr/bin/env ruby require 'zipf' module PhrasePhraseExtraction DEBUG = false MAX_NT = 1 # Chiang: 2 MAX_SEED_NUM_WORDS = 3 # Chiang: 10 words, -> phrases! MAX_SRC_SZ = 8 # Chiang: 5 words, -> words! FORBID_SRC_ADJACENT_SRC_NT = true # Chiang:true class Rule attr_accessor :source, :target, :arity, :source_context, :target_context, :alignment def initialize source_range=nil, target_range=nil, source_context=nil, target_context=nil, alignment=[] if source_context && target_range && source_context && target_context @source = [source_range] @target = [target_range] @source_context = source_context @target_context = target_context @alignment = alignment else @source = [] @target = [] @source_context = [] @target_context = [] @alignment = [] end @arity = 0 end def hash self.as_trule_string.hash end def eql? other self.as_trule_string == other.as_trule_string end def len_src src_len = 0 @source.each { |i| if i.is_a? String src_len += 1 else src_len += i.last-i.first+1 end } return src_len end def len_src_w src_len = 0 @source.each { |i| if i.is_a? String src_len += i.split.size #1 else i.each { |j| src_len += source_context[j].split.size } end } return src_len end def len_tgt tgt_len = 0 @target.each { |i| if i.is_a? String tgt_len += 1 else tgt_len += i.last-i.first+1 end } return tgt_len end def len_tgt_w tgt_len = 0 @target.each { |i| if i.is_a? String tgt_len += i.split.size else i.each { |j| tgt_len += target_context[j].split.size } end } return tgt_len end def to_s source_string = "" @source.each { |i| if i.is_a? Range source_string += @source_context[i].to_s else source_string += " #{i} " end } target_string = "" @target.each { |i| if i.is_a? Range target_string += @target_context[i].to_s else target_string += " #{i} " end } astr = "" @alignment.each { |p| astr += " #{p.first}-#{p.last}" } astr.strip! return "#{source_string.gsub(/\s+/, " ").strip} -> #{target_string.gsub(/\s+/, " ").strip} | #{astr}" end def rebase_alignment offset_src, offset_tgt STDERR.write "a before rebasing #{@alignment.to_s}\n" if DEBUG @alignment.each_with_index { |p,j| @alignment[j] = [p.first-offset_src, p.last-offset_tgt] } STDERR.write "a after rebasing #{@alignment.to_s}\n" if DEBUG end def rebase_alignment1 correct_src, correct_tgt, start_source, start_target @alignment.each_with_index { |p,j| if p[0] > start_source @alignment[j][0] = [0,p.first-correct_src].max end if p[1] > start_target @alignment[j][1] = [0,p.last-correct_tgt].max end } end def get_source_string source_string = "" @source.each { |i| if i.is_a? Range source_string += @source_context[i].join(" ").strip else source_string += " #{i} " end } source_string = source_string.lstrip.strip return source_string end def get_target_string target_string = "" @target.each { |i| if i.is_a? Range target_string += @target_context[i].join(" ").strip else target_string += " #{i} " end } target_string = target_string.lstrip.strip return target_string end def as_trule_string source_string = get_source_string target_string = get_target_string astr = "" @alignment.each { |p| astr += " #{p.first}-#{p.last}" } astr.strip! return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" end def is_terminal? @source.each { |i| return false if !i.is_a? Range } @target.each { |i| return false if !i.is_a? Range } return true end def mergeable_with? other_rule # check if other_rule is a part of self return false if !other_rule.is_terminal? other_source_begin = other_rule.source.first.first other_source_end = other_rule.source.first.last other_target_begin = other_rule.target.first.first other_target_end = other_rule.target.first.last b = false @source.each { |i| next if !i.is_a? Range if ( other_source_begin >= i.first \ && other_source_end <= i.last \ && (!(other_source_begin==i.first && other_source_end==i.last))) b = true break end } return false if !b bb = false @target.each { |i| next if !i.is_a? Range if ( other_target_begin >= i.first \ && other_target_end <= i.last \ && (!(other_target_begin==i.first && other_target_end==i.last))) bb = true break end } return b&&bb end def self.split a, b, index=0, p="target" return "[NEWX,#{index}]"if (a==b) aa = a.to_a begin_split = b.first end_split = b.last p1 = aa[0..aa.index([begin_split-1,aa.first].max)] p2 = aa[aa.index([end_split+1, aa.last].min)..aa.last] nt = "[NEWX,#{index}]" ret = nil if begin_split > a.first && end_split < a.last ret = [(p1.first..p1.last), nt, (p2.first..p2.last)] elsif begin_split == a.first ret = [nt, (p2.first..p2.last)] elsif end_split == a.last ret = [(p1.first..p1.last), nt] end return ret end def self.merge r, s if DEBUG STDERR.write "#{r.to_s}" STDERR.write "#{s.to_s}" end return nil if !r.mergeable_with? s return nil if !s.is_terminal? other_source_begin = s.source.first.first other_source_end = s.source.first.last other_target_begin = s.target.first.first other_target_end = s.target.first.last new_rule = Rule.new new_rule.source_context = r.source_context new_rule.target_context = r.target_context new_rule.arity = r.arity+1 new_rule.alignment = Array.new r.alignment.each { |p| new_rule.alignment << Array.new(p) } # deep copy c = new_rule.arity done = false correct_src = 0 r.source.each_with_index { |i,j| if i.is_a? Range if ( !done \ && other_source_begin >= i.first \ && other_source_end <= i.last) new_rule.source << Rule.split(i, (other_source_begin..other_source_end), c, "source") new_rule.source.flatten! done = true else new_rule.source << i end else new_rule.source << i end } # relabel Xs (linear on source side) switch = false k = 1 new_rule.source.each_with_index { |i,j| if i.is_a? String m = i.match(/\[(X|NEWX),(\d+)\]/) n = m[1] l = m[2].to_i if k != l switch = true end new_rule.source[j] = "[#{n},#{k}]" k += 1 end } STDERR.write "switch #{switch}\n" if DEBUG done = false correct_tgt = 0 r.target.each_with_index { |i,j| if i.is_a? Range if ( !done \ && other_target_begin >= i.first \ && other_target_end <= i.last) new_rule.target << Rule.split(i, (other_target_begin..other_target_end), c) new_rule.target.flatten! done = true else new_rule.target << i end else new_rule.target << i reorder = true end } correct_src = r.len_src-new_rule.len_src correct_tgt = r.len_tgt-new_rule.len_tgt if DEBUG STDERR.write "correct_src #{correct_src}\n" STDERR.write "correct_tgt #{correct_tgt}\n" end start_correct_source = nil j = 0 fl = [] new_rule.source.each { |i| if i.is_a? Range fl << new_rule.source_context[i] else if i.match(/\[NEWX,\d+\]/) STDERR.write "j = #{j}\n" if DEBUG start_correct_source = j end fl << i end j += 1 } fl.flatten! start_correct_target = nil j = 0 fl.each { |i| if i.match(/\[NEWX,\d+\]/) STDERR.write "j = #{j}\n" if DEBUG start_correct_source = j break end j += 1 } el = [] new_rule.target.each { |i| if i.is_a? Range el << new_rule.target_context[i] else el << i end j += 1 } el.flatten! start_correct_target = nil j = 0 el.each { |i| if i.match(/\[NEWX,\d+\]/) STDERR.write "j = #{j}\n" if DEBUG start_correct_target = j break end j += 1 } if DEBUG STDERR.write "start_correct_source = #{start_correct_source}\n" STDERR.write "start_correct_target = #{start_correct_target}\n" end new_rule.rebase_alignment1 correct_src, correct_tgt, start_correct_source, start_correct_target STDERR.write "not uniq'ed #{new_rule.alignment.to_s}\n" if DEBUG new_rule.alignment.uniq! if DEBUG STDERR.write "a before: #{new_rule.alignment.to_s}\n" STDERR.write "#{fl.to_s}\n" end new_rule.alignment.reject! { |p| !fl[p.first] || !el[p.last] || fl[p.first].match(/\[(NEWX|X),\d+\]/) || el[p.last].match(/\[(NEWX|X),\d+\]/) } if DEBUG STDERR.write "a after: #{new_rule.alignment.to_s}\n" STDERR.write "old len_src #{r.len_src}\n" STDERR.write "new len_src #{new_rule.len_src}\n" STDERR.write "old len_tgt #{r.len_tgt}\n" STDERR.write "new len_tgt #{new_rule.len_tgt}\n" end if switch new_rule.target.each_with_index { |i,j| if i.is_a? String m = i.match(/\[(X|NEWX),(\d+)\]/) n = m[1] k = m[2].to_i l = nil if k == 1 l = 2 else # 2 l = 1 end new_rule.target[j] = "[#{n},#{l}]" end } end new_rule.source.each_with_index { |i,j| if i.is_a?(String) && i.match(/\[NEWX,\d\]/) i.gsub!(/NEWX/, "X") end } new_rule.target.each_with_index { |i,j| if i.is_a?(String) && i.match(/\[NEWX,\d\]/) i.gsub!(/NEWX/, "X") end } return new_rule end def expand_fake_alignment new_alignment = [] if DEBUG STDERR.write "#{@alignment.to_s}\n" STDERR.write "#{@source.to_s}\n" STDERR.write "#{@target.to_s}\n" end fl = @source.map { |i| if i.is_a? Range @source_context[i].map{|x|x.split} else i end }.flatten 1 el = @target.map { |i| if i.is_a? Range @target_context[i].map{|x|x.split} else i end }.flatten 1 if DEBUG STDERR.write "#{fl.to_s}\n" STDERR.write "#{el.to_s}\n" STDERR.write "->\n" end offsets_src = {} o = 0 fl.each_with_index { |i,j| if i.is_a? Array o += i.size-1 end offsets_src[j] = o } offsets_tgt = {} o = 0 el.each_with_index { |i,j| if i.is_a? Array o += i.size-1 end offsets_tgt[j] = o } @alignment.each { |p| if DEBUG STDERR.write "#{p.to_s}\n" STDERR.write "#{offsets_src[p.first]} -- #{offsets_tgt[p.last]}\n" end new_alignment << [ p.first+offsets_src[p.first], p.last+offsets_tgt[p.last] ] if DEBUG STDERR.write "#{new_alignment.last.to_s}\n" STDERR.write "---\n" STDERR.write "\n" end } @alignment = new_alignment end end def PhrasePhraseExtraction.make_gappy_rules rules, seed_rules MAX_NT.times { new_rules = [] rules.each { |r| seed_rules.each { |s| if r.mergeable_with? s new = Rule.merge r, s new_rules << new STDERR.write "#{r.to_s} <<< #{s.to_s}\n" if DEBUG STDERR.write " = #{new.to_s}\n\n" if DEBUG end } } rules += new_rules } return rules end def PhrasePhraseExtraction.has_alignment a, i, dir="src" index = 0 index = 1 if dir=="tgt" a.each { |p| return true if p[index]==i } return false end def PhrasePhraseExtraction.extract fstart, fend, estart, eend, f, e, a, flen, elen a.each { |p| fi=p[0]; ei=p[1] if (fstart..fend).include? fi if eieend return [] end end if (estart..eend).include? ei if fifend return [] end end } rules = [] fs = fstart loop do fe = fend loop do rules << Rule.new(fs..fe, estart..eend, f, e) a.each { |p| if (fs..fe).include?(p.first) rules.last.alignment << p end } rules.last.rebase_alignment fs, estart fe += 1 break if has_alignment(a, fe, "src")||fe>=flen end fs -= 1 break has_alignment(a, fs, "src")||fs<0 end return rules end def PhrasePhraseExtraction.make_seed_rules a, e, f rules = [] (0..e.size-1).each { |estart| (estart..e.size-1).each { |eend| fstart = f.size-1 fend = 0 a.each { |p| fi=p[0]; ei=p[1] if estart<=ei && ei<=eend fstart = [fi, fstart].min fend = [fi, fend].max end } next if fstart>fend STDERR.write "fstart #{fstart}, fend #{fend}, estart #{estart}, eend #{eend}, fsize #{f.size}, esize #{e.size}\n" if DEBUG new_rules = extract fstart, fend, estart, eend, f, e, a, f.size, e.size new_rules.each { |r| STDERR.write "#{r.to_s}\n" if DEBUG } rules += new_rules } } return rules end def PhrasePhraseExtraction.extract_rules f, e, as, expand=false if DEBUG STDERR.write "----------\n" STDERR.write "#{f.to_s}\n" STDERR.write "#{e.to_s}\n" STDERR.write "#{as.to_s}\n" STDERR.write "----------\n" end a = [] as.each { |p| x,y = p.split "-" x = x.to_i; y = y.to_i a << [x,y] } rules = PhrasePhraseExtraction.make_seed_rules a, e,f seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules seed_rules.uniq! { |r| "#{r.get_source_string} ||| #{r.get_target_string}" } if DEBUG STDERR.write "seed rules:\n" seed_rules.each { |r| STDERR.write "#{r.to_s}" } end rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT rules = PhrasePhraseExtraction.remove_adjacent_nt rules end rules = PhrasePhraseExtraction.remove_too_long_src_sides rules if expand rules.each { |r| r.expand_fake_alignment } end rules.reject! { |r| r.alignment.size == 0 } return rules end def PhrasePhraseExtraction.remove_too_large_seed_phrases rules return rules.reject { |r| STDERR.write "#{r}\n" if DEBUG src_len = r.len_src tgt_len = r.len_tgt src_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS \ || tgt_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS } end def PhrasePhraseExtraction.remove_adjacent_nt rules return rules.reject { |r| b = false prev = false r.source.each { |i| if i.is_a? String if prev b = true break end prev = true else prev = false end } b =begin c = false prev = false r.target.each { |i| if i.is_a? String if prev c = true break end prev = true else prev = false end } b || c =end } end def PhrasePhraseExtraction.remove_too_long_src_sides rules return rules.reject { |r| r.len_src_w > PhrasePhraseExtraction::MAX_SRC_SZ } end def PhrasePhraseExtraction.test # 0 1 2 3 # a b c d # w x y z # 0-0 # 1-3 # 2-2 # 3-1 ra = Rule.new rb = Rule.new ra.source = [(0..2), "[X,1]"] ra.target = [(0..0), "[X,1]", (2..3)] ra.source_context = ["a", "b", "c", "d"] ra.target_context = ["w", "x", "y", "z"] ra.alignment = [[0,0],[1,3],[2,2]] ra.arity = 1 rb.source = [(1..1)] rb.target = [(3..3)] rb.source_context = ["a", "b", "c", "d"] rb.target_context = ["w", "x", "y", "z"] rb.alignment = [[0,0]] rb.arity = 0 puts ra.mergeable_with? rb nr = Rule.merge ra, rb puts ra.to_s puts rb.to_s puts nr.to_s end def PhrasePhraseExtraction.test_phrase ra = Rule.new rb = Rule.new ra.source = [(0..2), "[X,1]"] ra.target = [(0..0), "[X,1]", (2..3)] ra.source_context = ["a a", "b b", "c c", "d d"] ra.target_context = ["w w", "x x", "y y", "z z"] ra.alignment = [[0,0],[1,3],[2,2]] ra.expand_fake_alignment ra.arity = 1 rb.source = [(1..1)] rb.target = [(3..3)] rb.source_context = ra.source_context rb.target_context = rb.source_context rb.alignment = [[0,0]] rb.expand_fake_alignment rb.arity = 0 puts ra.mergeable_with? rb nr = Rule.merge ra, rb puts ra.to_s puts rb.to_s nr.expand_fake_alignment puts nr.to_s end def PhrasePhraseExtraction.test_phrase1 source_context = ["a", "b", "c", "Blechbänder", ", besteht", "der Spreizdorn im wesentlichen", "aus", "x"] target_context = ["w", "x", "y", "the expansion", "mandrel consists", "essentially of expansion mandrel", "z", "xxx", "XXX"] ra = Rule.new ra.source = ["[X,1]", (3..6)] ra.target = ["[X,1]", (3..5)] ra.source_context = source_context ra.target_context = target_context ra.alignment = [[1,1],[2,2],[3,3],[4,2]] ra.arity = 1 rb = Rule.new rb.source = [(4..6)] rb.target = [(4..5)] rb.source_context = source_context rb.target_context = target_context rb.alignment = [[0,0],[1,1],[2,0]] rb.arity = 0 puts ra.mergeable_with? rb nr = Rule.merge ra, rb puts ra.to_s puts rb.to_s nr.expand_fake_alignment puts nr.to_s end def PhrasePhraseExtraction.test_phrase2 f = ["synergistische", "pharmazeutische"] e = ["synergistic", "XXX", "pharmaceutical"] a = ["0-0", "1-2"] new_rules = extract_rules f, e, a, false new_rules.each { |r| puts r.to_s } end end # module def main file = ReadFile.new ARGV[0] f = file.gets.split e = file.gets.split a = [] file.gets.split.each { |p| x,y = p.split "-" x = x.to_i; y = y.to_i a << [x,y] } rules = PhrasePhraseExtraction.make_seed_rules a, e, f seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT rules = PhrasePhraseExtraction.remove_adjacent_nt rules end rules = PhrasePhraseExtraction.remove_too_long_src_sides rules rules.uniq! { |r| "#{r.get_source_string} ||| #{r.get_target_string}" } rules.each { |r| puts r.as_trule_string } end #main def test PhrasePhraseExtraction.test PhrasePhraseExtraction.test_phrase PhrasePhraseExtraction.test_phrase1 PhrasePhraseExtraction.test_phrase2 end #test