diff options
author | Patrick Simianer <p@simianer.de> | 2015-09-19 14:47:05 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-09-19 14:47:05 +0200 |
commit | 1576f31ff739288dc21c120ab8c04dbe5406cad7 (patch) | |
tree | e0b05fa1ce8f466f896b90cd89277578f89a2c69 /derivation_to_json/deriv.rb | |
parent | 084f652eb431aafd36d0ea64dcd37d4eac7cd393 (diff) |
code to convert cdec's output to json for derivation editor
Diffstat (limited to 'derivation_to_json/deriv.rb')
-rwxr-xr-x | derivation_to_json/deriv.rb | 151 |
1 files changed, 151 insertions, 0 deletions
#!/usr/bin/env ruby

# Converts cdec's derivation output (one line read from STDIN) into a JSON
# object holding the source word groups, the target word groups and a phrase
# alignment between the two, for use by a derivation editor.

require 'json' # Hash#to_json is used below; stated explicitly instead of relying on another library to load it
require 'zipf' # splitpipe is not defined in this file; it is expected to come from here

# Splits a derivation string into one string per rule.
#
# Every "(" starts a new line, all "{" and "}" are removed, trailing runs of
# ")" (together with any whitespace in front of them) are cut off, and lines
# that are blank after stripping are dropped. The argument is not modified.
#
# @param s [String] raw derivation string
# @return [Array<String>] the non-empty rule lines, in input order
def conv_cdec_show_deriv(s)
  s.gsub(/\(/, "\n")
   .gsub(/[{}]/, '')
   .split("\n")
   .map { |line| line.strip.gsub(/(\s*\))+$/, '') }
   .reject { |line| line.strip == '' }
end

# One rule of a derivation together with the span it covers.
#
# Parsed from a string of three "|||"-separated fields: a span/symbol field
# (e.g. "<0,3> [X]"), the source side and the target side.
# NOTE(review): the exact field layout is inferred from the parsing code and
# from #to_s -- confirm against real cdec output.
class RuleAndSpan
  attr_accessor :span, :symbol, :source, :target, :subspans, :done, :id

  # @param s [String] one rule line as returned by conv_cdec_show_deriv
  # @param id [Integer] identifier assigned by the caller
  def initialize(s, id)
    spans, srcs, tgts = splitpipe(s.strip)
    span_str, @symbol = spans.split
    # "<i,j>" -> [i, j]; anything after the second number is ignored by to_i.
    @span = span_str.split(',', 2).map { |part| part.gsub(/[<>]/, '').to_i }
    # Number the "[X]" placeholders on the source side from left to right so
    # they read "[1]", "[2]", ...
    nt_count = 0
    @source = srcs.strip.split.map do |tok|
      tok.match(/\[X\]/) ? "[#{nt_count += 1}]" : tok
    end
    @target = tgts.strip.split
    @subspans = []
    @done = false
    @id = id
  end

  # @return [String] the id, a tab, then the rule in "<i,j> [X] ||| src ||| tgt" form
  def to_s
    "#{@id}\t<#{@span.first},#{@span[1]}> [X] ||| #{@source.join(' ')} ||| #{@target.join(' ')}"
  end

  # @return [Boolean] true if at least one source token does not match the
  #   placeholder pattern below
  def is_terminal_rule?
    # NOTE(review): this alternation parses as "\[S" OR "(\d+)\]", not as
    # "\[(S|\d+)\]". Kept as-is because tightening it could change which
    # tokens count as terminals -- confirm the intent.
    @source.any? { |w| !w.match(/\[S|(\d+)\]/) }
  end
end

# Walks a rule and, recursively, the rules filling its numbered placeholders,
# appending words to +o+ and [word, rule id] pairs to +groups+.
#
# A new group is opened on entry (unless the last group is still empty) and
# again after a placeholder whenever a plain word follows later in the same
# rule, so each group only holds consecutive words of a single rule.
#
# @param span [RuleAndSpan] rule to expand
# @param spans [Array<Array<Integer>>] all collected spans (only passed through)
# @param by_span [Hash{Array<Integer> => RuleAndSpan}] rule lookup by span
# @param o [Array<String>] flat word sequence, appended to in place
# @param groups [Array<Array<Array>>] word groups, appended to in place
# @param source [Boolean] true to walk the source side, false for the target side
# @return [true]
def derive(span, spans, by_span, o, groups, source)
  groups << [] if groups.size == 0 || groups.last.size > 0
  words = source ? span.source : span.target
  words.each_with_index do |w, k|
    nt = w.match(/\[(\d+)\]/)
    if nt
      # Placeholder "[n]" refers to the n-th sub-span (1-based).
      idx = nt.captures.first.to_i - 1
      derive(by_span[span.subspans[idx]], spans, by_span, o, groups, source)
      # Open a fresh group only if this rule still has a plain word to put in it.
      rest_has_word = words[(k + 1)..-1].any? { |x| !x.match(/\[(\d+)\]/) }
      groups << [] if rest_has_word && groups.last.size > 0
    else
      groups.last << [w.dup, span.id]
      o << w
    end
  end
  span.done = true
end

# Converts one derivation string into a JSON document.
#
# @param s [String] raw derivation string
# @return [String] JSON object with the keys "phrase_alignment" (for each
#   source group, the indices of the target groups produced by the same rule),
#   "source_groups" and "target_groups" (arrays of word arrays)
def proc_deriv(s)
  by_span = {}
  spans = []
  conv_cdec_show_deriv(s).each_with_index do |line, id|
    rs = RuleAndSpan.new(line, id)
    by_span[rs.span] = rs # a later rule with an identical span replaces the earlier one
    spans << rs.span if rs.is_terminal_rule?
  end

  # Attach every span to the nearest preceding span that contains it.
  spans.reverse.each_with_index do |inner, k|
    (spans.size - (k + 2)).downto(0) do |l|
      outer = spans[l]
      next unless inner[0] >= outer[0] && inner[1] <= outer[1]

      by_span[outer].subspans << inner
      break
    end
  end

  # Sub-spans were collected back to front; restore their original order.
  spans.each { |sp| by_span[sp].subspans.reverse! }

  # Source side.
  source_words = []
  source_groups = []
  spans.each do |sp|
    next if by_span[sp].done

    derive(by_span[sp], spans, by_span, source_words, source_groups, true)
  end

  # Clear the visited flags so the target side can be walked the same way.
  spans.each { |sp| by_span[sp].done = false }

  # Target side.
  target_words = []
  groups = []
  spans.each do |sp|
    next if by_span[sp].done

    derive(by_span[sp], spans, by_span, target_words, groups, false)
  end

  # Each group is identified by the rule id stored with its first word.
  source_ids = source_groups.map { |g| g.first[1] }
  target_ids = groups.map { |g| g.first[1] }

  phrase_align = source_ids.map do |sid|
    target_ids.each_index.select { |k| target_ids[k] == sid }
  end

  {
    phrase_alignment: phrase_align,
    source_groups: source_groups.map { |g| g.map(&:first) },
    target_groups: groups.map { |g| g.map(&:first) }
  }.to_json
end

# Only the first line of STDIN is processed; gets returns nil on empty input,
# in which case nothing is printed.
input = STDIN.gets
puts proc_deriv(input.strip) if input