From 138aadb9fc2e868beece86743539634aa3664502 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 5 Nov 2015 20:16:01 +0100 Subject: derivation_to_json: word alignment as hint for group alignment --- derivation_to_json/README | 4 +- derivation_to_json/derivation_to_json.rb | 134 +++++++++++++++++++++++++++++-- derivation_to_json/example.2.json | 1 + derivation_to_json/example.2.output | 1 + derivation_to_json/example.2.raw | 10 +++ derivation_to_json/example.3.json | 1 + derivation_to_json/example.3.output | 1 + derivation_to_json/example.3.raw | 13 +++ 8 files changed, 156 insertions(+), 9 deletions(-) create mode 100644 derivation_to_json/example.2.json create mode 100644 derivation_to_json/example.2.output create mode 100644 derivation_to_json/example.2.raw create mode 100644 derivation_to_json/example.3.json create mode 100644 derivation_to_json/example.3.output create mode 100644 derivation_to_json/example.3.raw (limited to 'derivation_to_json') diff --git a/derivation_to_json/README b/derivation_to_json/README index 9aa15a1..947cefd 100644 --- a/derivation_to_json/README +++ b/derivation_to_json/README @@ -1,2 +1,4 @@ -This hack reads cdec's "--show_derivations" into a an object. +This (horrid) hack reads cdec's "--show_derivations" and "--extract_rules" +into data structures and tries to align "groups" in source and target sides +of rules in a smart, presentable way. diff --git a/derivation_to_json/derivation_to_json.rb b/derivation_to_json/derivation_to_json.rb index f4706f3..afd62b7 100755 --- a/derivation_to_json/derivation_to_json.rb +++ b/derivation_to_json/derivation_to_json.rb @@ -1,9 +1,10 @@ #!/usr/bin/env ruby require 'zipf' +require 'stringio' class RuleAndSpan - attr_accessor :span, :symbol, :source, :target, :subspans, :done, :id + attr_accessor :span, :symbol, :source, :target, :subspans, :done, :id, :trule def initialize s, id spans, srcs, tgts = splitpipe s.strip @@ -29,17 +30,112 @@ class RuleAndSpan return false end + + def match_with_rule r + if @source.join(" ").gsub(/\[\d+\]/, "[X]")==r.f \ + && @target.join(" ").gsub(/\[\d+\]/, "[X]")==r.e + return true + end + + return false + end +end + +class Rule + attr_accessor :nt, :f, :e, :v, :a, :ha, :source_groups, :target_groups + + def initialize s + splitpipe(s).each_with_index { |i,j| + i = i.strip.lstrip + if j == 0 # NT + @nt = i + elsif j == 1 # french + @f = i.gsub(/\[\d+\]/, "[X]") + @fa = @f.split + @source_groups = @f.split("[X]").map{|i|i.strip.lstrip} + @source_groups.reject! { |i| i=="" } + elsif j == 2 # english + @e = i.gsub(/\[\d+\]/, "[X]") + @ea = @e.split + @target_groups = @e.split("[X]").map{|i|i.strip.lstrip} + @target_groups.reject! { |i| i=="" } + elsif j == 3 # vector + @v = i + elsif j == 4 # alignment + @a = i + @ha = {} + @a.split.each { |i| + x,y = i.split("-") + x = x.to_i + y = y.to_i + rx = 0 + (0).upto(x-1) { |k| + if @fa[k].match /\[X\]/ + rx += 1 + end + } + ry = 0 + (0).upto(y-1) { |k| + if @ea[k].match /\[X\]/ + ry += 1 + end + } + x -= rx + y -= ry + if @ha[x] + @ha[x] << y + else + @ha[x] = [y] + end + } + else # error + end + } + end + + def group_has_link ngroup_source, ngroup_target + offset_source = 0 + (0).upto(ngroup_source-1) { |i| + offset_source += @source_groups[i].split.size + } + offset_target = 0 + (0).upto(ngroup_target-1) { |i| + offset_target += @target_groups[i].split.size + } + (offset_source).upto(-1+offset_source+@source_groups[ngroup_source].split.size) { |i| + next if !@ha[i] + @ha[i].each { |k| + if (offset_target..(-1+offset_target+@target_groups[ngroup_target].split.size)).include? k + return true + end + } + } + + return false + end + + def to_s + "#{@nt} ||| #{@f} ||| #{@e} ||| #{@v} ||| #{@a}\n" + end end def conv_cdec_show_deriv s - a = s.split("}").map { |i| + rules = [] + xx = StringIO.new s + d_s = xx.gets + while line = xx.gets + r = Rule.new(line) + rules << r + end + + a = d_s.split("}").map { |i| i.gsub /^[()\s]*/, "" }.reject { |i| i.size==0 }.map { |i| i.gsub /^\{/, "" } - return a + return a, rules end def derive span, spans, by_span, o, groups, source @@ -63,7 +159,7 @@ def derive span, spans, by_span, o, groups, source end } else - groups.last << ["#{w}", span.id] + groups.last << ["#{w}", span.id, span.trule] o << w end } @@ -71,13 +167,18 @@ def derive span, spans, by_span, o, groups, source end def proc_deriv s - a = conv_cdec_show_deriv s + a, rules = conv_cdec_show_deriv s by_span = {} spans = [] id = 0 a.each { |line| rs = RuleAndSpan.new line, id + rules.each { |r| + if rs.match_with_rule r + rs.trule = r + end + } id += 1 by_span[rs.span] = rs if rs.is_terminal_rule? @@ -120,15 +221,28 @@ def proc_deriv s rgroups = [] source_groups.each { |i| source_rgroups << i.first[1] } groups.each { |i| rgroups << i.first[1] } + rules_by_span_id = {} + source_groups.each { |i| + rules_by_span_id[i.first[1]] = i.first[2] + } phrase_align = [] - source_rgroups.each { |i| + count_source = {} + count_target = {} + count_source.default = 0 + count_target.default = 0 + source_rgroups.each_with_index { |i| phrase_align << [] rgroups.each_with_index { |j,k| if i==j - phrase_align.last << k + if rules_by_span_id[i].group_has_link count_source[i], count_target[j] + phrase_align.last << k + end + count_target[j] += 1 end } + count_source[i] += 1 + count_target.clear } h = {} @@ -140,7 +254,11 @@ def proc_deriv s end if __FILE__ == $0 - json = proc_deriv(STDIN.gets.strip) + s = "" + while line = STDIN.gets + s += line + end + json = proc_deriv(s) obj = JSON.parse(json) STDERR.write "#{json}\n" puts obj["target_groups"].join " " diff --git a/derivation_to_json/example.2.json b/derivation_to_json/example.2.json new file mode 100644 index 0000000..e3e05fd --- /dev/null +++ b/derivation_to_json/example.2.json @@ -0,0 +1 @@ +{"phrase_alignment":[[0],[1],[2],[3],[4],[5],[6],[7]],"source_groups":["die","neuerung","bezieht sich auf gassensoren","auf basis von","metalloxid @-@ halbleitern ,","die sehr","empfindlich und wenig temperaturabhängig","sind ."],"target_groups":["the","invention","relates to gas sensors","which are based on metal","@-@ oxide semiconductors and which","are very","sensitive and not appreciably temperature @-@ dependent","."]} diff --git a/derivation_to_json/example.2.output b/derivation_to_json/example.2.output new file mode 100644 index 0000000..cd3ed15 --- /dev/null +++ b/derivation_to_json/example.2.output @@ -0,0 +1 @@ +the invention relates to gas sensors which are based on metal @-@ oxide semiconductors and which are very sensitive and not appreciably temperature @-@ dependent . diff --git a/derivation_to_json/example.2.raw b/derivation_to_json/example.2.raw new file mode 100644 index 0000000..7fed320 --- /dev/null +++ b/derivation_to_json/example.2.raw @@ -0,0 +1,10 @@ +({<0,21> [Goal] ||| [S] ||| [1]}({<0,21> [S] ||| [S] [X] ||| [1] [2]}({<0,13> [S] ||| [X] ||| [1]}({<0,13> [X] ||| [X] metalloxid @-@ halbleitern , ||| [1] @-@ oxide semiconductors and which}({<0,9> [X] ||| die [X] auf basis von ||| the [1] which are based on metal}({<1,6> [X] ||| [X] bezieht sich auf gassensoren ||| [1] relates to gas sensors}({<1,2> [X] ||| neuerung ||| invention}) ) ) ) ) ({<13,21> [X] ||| die sehr [X] sind . ||| are very [1] .}({<15,19> [X] ||| empfindlich und wenig temperaturabhängig ||| sensitive and not appreciably temperature @-@ dependent}) ) ) ) +[X] ||| neuerung ||| invention ||| ForceRule=1 ||| 0-0 +[X] ||| [X] bezieht sich auf gassensoren ||| [1] relates to gas sensors ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.45124 MaxLexFgivenE=2.73473 CountEF=0.477121 SampleCountF=0.477121 EgivenFCoherent=-0 ||| 1-1 1-2 2-1 4-3 4-4 +[X] ||| die [X] auf basis von ||| the [1] which are based on metal ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=5.95916 MaxLexFgivenE=3.2265 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 2-4 3-4 3-5 4-6 +[X] ||| [X] metalloxid @-@ halbleitern , ||| [1] @-@ oxide semiconductors and which ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=3.79715 MaxLexFgivenE=2.26688 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 1-2 2-1 3-3 4-5 +[S] ||| [X] ||| [1] +[X] ||| empfindlich und wenig temperaturabhängig ||| sensitive and not appreciably temperature @-@ dependent ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=6.5059 MaxLexFgivenE=1.51642 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 1-1 2-3 3-3 3-4 3-6 +[X] ||| die sehr [X] sind . ||| are very [1] . ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=0.456219 MaxLexFgivenE=2.16613 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 1-1 3-3 4-3 +[S] ||| [S] [X] ||| [1] [2] ||| Glue=1 +[Goal] ||| [S] ||| [1] diff --git a/derivation_to_json/example.3.json b/derivation_to_json/example.3.json new file mode 100644 index 0000000..cbfacb8 --- /dev/null +++ b/derivation_to_json/example.3.json @@ -0,0 +1 @@ +{"phrase_alignment":[[0],[1],[],[2,4],[3],[5],[6],[],[8],[9],[10]],"source_groups":["in einer","favorisierten","ausführung","dient einer","der schwenkbolzen ( 8 )","zugleich","als messbolzen ,","indem","an ihm dehnungsmess @-@ streifen","( 12",") angebracht sind ."],"target_groups":["in a","fuck that","of","the hinge bolts ( 8 )","is used","at the","time as","a measuring bolt ,","by having strain gauge strips","( 12",") attached to it ."]} diff --git a/derivation_to_json/example.3.output b/derivation_to_json/example.3.output new file mode 100644 index 0000000..cee1e65 --- /dev/null +++ b/derivation_to_json/example.3.output @@ -0,0 +1 @@ +in a fuck that of the hinge bolts ( 8 ) is used at the time as a measuring bolt , by having strain gauge strips ( 12 ) attached to it . diff --git a/derivation_to_json/example.3.raw b/derivation_to_json/example.3.raw new file mode 100644 index 0000000..66a5743 --- /dev/null +++ b/derivation_to_json/example.3.raw @@ -0,0 +1,13 @@ +({<0,27> [Goal] ||| [S] ||| [1]}({<0,27> [S] ||| [S] [X] ||| [1] [2]}({<0,12> [S] ||| [X] ||| [1]}({<0,12> [X] ||| in einer [X] ||| in a [1]}({<2,12> [X] ||| [X] ausführung [X] zugleich ||| [1] [2] at the}({<2,3> [X] ||| favorisierten ||| fuck that}) ({<4,11> [X] ||| dient einer [X] ||| of [1] is used}({<6,11> [X] ||| der schwenkbolzen ( 8 ) ||| the hinge bolts ( 8 )}) ) ) ) ) ({<12,27> [X] ||| [X] ) angebracht sind . ||| [1] ) attached to it .}({<12,23> [X] ||| [X] indem [X] ( 12 ||| [1] a measuring bolt , [2] ( 12}({<12,15> [X] ||| als messbolzen , ||| time as}) ({<16,21> [X] ||| an ihm dehnungsmess @-@ streifen ||| by having strain gauge strips}) ) ) ) ) +[X] ||| favorisierten ||| fuck that ||| ForceRule=1 ||| 0-0 0-1 +[X] ||| der schwenkbolzen ( 8 ) ||| the hinge bolts ( 8 ) ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=4.2972 MaxLexFgivenE=3.01678 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 1-2 2-1 2-3 2-5 3-4 4-5 +[X] ||| dient einer [X] ||| of [1] is used ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=2.51875 MaxLexFgivenE=3.28665 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-2 0-3 1-0 +[X] ||| [X] ausführung [X] zugleich ||| [1] [2] at the ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=2.87767 MaxLexFgivenE=7.30184 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 3-3 +[X] ||| in einer [X] ||| in a [1] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.605196 MaxLexFgivenE=1.51196 CountEF=0.69897 SampleCountF=1.63347 EgivenFCoherent=1.02119 ||| 0-0 1-1 +[S] ||| [X] ||| [1] +[X] ||| als messbolzen , ||| time as ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=0.328333 MaxLexFgivenE=2.60944 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-1 1-0 2-0 +[X] ||| an ihm dehnungsmess @-@ streifen ||| by having strain gauge strips ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=7.23159 MaxLexFgivenE=6.6978 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-4 2-2 3-0 4-3 +[X] ||| [X] indem [X] ( 12 ||| [1] a measuring bolt , [2] ( 12 ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=10.3295 MaxLexFgivenE=3.69621 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 3-6 4-7 +[X] ||| [X] ) angebracht sind . ||| [1] ) attached to it . ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=5.52044 MaxLexFgivenE=3.17712 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 1-1 2-2 3-3 4-5 +[S] ||| [S] [X] ||| [1] [2] ||| Glue=1 +[Goal] ||| [S] ||| [1] -- cgit v1.2.3