summaryrefslogtreecommitdiff
path: root/derivation_to_json
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2015-11-05 20:16:01 +0100
committerPatrick Simianer <p@simianer.de>2015-11-05 20:16:01 +0100
commit138aadb9fc2e868beece86743539634aa3664502 (patch)
tree49f547e19b5a91ad7a87f4f2ee97302299edb0ca /derivation_to_json
parent50fcce8314c07dc13d65da47c8fc6fdd16491495 (diff)
derivation_to_json: word alignment as hint for group alignment
Diffstat (limited to 'derivation_to_json')
-rw-r--r--derivation_to_json/README4
-rwxr-xr-xderivation_to_json/derivation_to_json.rb134
-rw-r--r--derivation_to_json/example.2.json1
-rw-r--r--derivation_to_json/example.2.output1
-rw-r--r--derivation_to_json/example.2.raw10
-rw-r--r--derivation_to_json/example.3.json1
-rw-r--r--derivation_to_json/example.3.output1
-rw-r--r--derivation_to_json/example.3.raw13
8 files changed, 156 insertions, 9 deletions
diff --git a/derivation_to_json/README b/derivation_to_json/README
index 9aa15a1..947cefd 100644
--- a/derivation_to_json/README
+++ b/derivation_to_json/README
@@ -1,2 +1,4 @@
-This hack reads cdec's "--show_derivations" into a an object.
+This (horrid) hack reads cdec's "--show_derivations" and "--extract_rules"
+into data structures and tries to align "groups" in source and target sides
+of rules in a smart, presentable way.
diff --git a/derivation_to_json/derivation_to_json.rb b/derivation_to_json/derivation_to_json.rb
index f4706f3..afd62b7 100755
--- a/derivation_to_json/derivation_to_json.rb
+++ b/derivation_to_json/derivation_to_json.rb
@@ -1,9 +1,10 @@
#!/usr/bin/env ruby
require 'zipf'
+require 'stringio'
class RuleAndSpan
- attr_accessor :span, :symbol, :source, :target, :subspans, :done, :id
+ attr_accessor :span, :symbol, :source, :target, :subspans, :done, :id, :trule
def initialize s, id
spans, srcs, tgts = splitpipe s.strip
@@ -29,17 +30,112 @@ class RuleAndSpan
return false
end
+
+ def match_with_rule r # Returns true when this span's source and target sides equal rule r's f and e.
+ if @source.join(" ").gsub(/\[\d+\]/, "[X]")==r.f \
+ && @target.join(" ").gsub(/\[\d+\]/, "[X]")==r.e # "[<digits>]" is rewritten to "[X]" on both sides before comparing
+ return true
+ end
+
+ return false # at least one side differs
+ end
+end
+
+class Rule # One rule line of the form "NT ||| source ||| target ||| features ||| alignment" (see to_s); parsed with splitpipe, which is defined outside this chunk.
+ attr_accessor :nt, :f, :e, :v, :a, :ha, :source_groups, :target_groups # ha: terminal-only source position => list of terminal-only target positions
+
+ def initialize s # s: one rule line; fields are handled by their position in the splitpipe result
+ splitpipe(s).each_with_index { |i,j| # i = field text, j = field position
+ i = i.strip.lstrip # trim surrounding whitespace
+ if j == 0 # NT: first field, stored as-is
+ @nt = i
+ elsif j == 1 # french: second field, the source side
+ @f = i.gsub(/\[\d+\]/, "[X]") # rewrite indexed placeholders such as [1] to [X]
+ @fa = @f.split # source tokens, [X] placeholders included
+ @source_groups = @f.split("[X]").map{|i|i.strip.lstrip} # runs of text between [X] placeholders
+ @source_groups.reject! { |i| i=="" } # drop empty runs, e.g. when the side starts with [X]
+ elsif j == 2 # english: third field, the target side
+ @e = i.gsub(/\[\d+\]/, "[X]") # same [X] rewrite as for the source side
+ @ea = @e.split # target tokens, [X] placeholders included
+ @target_groups = @e.split("[X]").map{|i|i.strip.lstrip} # runs of text between [X] placeholders
+ @target_groups.reject! { |i| i=="" } # drop empty runs
+ elsif j == 3 # vector: fourth field, stored as-is
+ @v = i
+ elsif j == 4 # alignment: fifth field, whitespace-separated "x-y" links
+ @a = i
+ @ha = {} # filled below; NOTE(review): stays nil for rule lines that have no fifth field
+ @a.split.each { |i| # one "x-y" link per iteration
+ x,y = i.split("-")
+ x = x.to_i # source token index
+ y = y.to_i # target token index
+ rx = 0 # number of [X] tokens before source position x
+ (0).upto(x-1) { |k|
+ if @fa[k].match /\[X\]/
+ rx += 1
+ end
+ }
+ ry = 0 # number of [X] tokens before target position y
+ (0).upto(y-1) { |k|
+ if @ea[k].match /\[X\]/
+ ry += 1
+ end
+ }
+ x -= rx # re-index so that positions count non-[X] tokens only
+ y -= ry
+ if @ha[x] # append to the existing list, or start a new one
+ @ha[x] << y
+ else
+ @ha[x] = [y]
+ end
+ }
+ else # error: any further field is silently ignored
+ end
+ }
+ end
+
+ def group_has_link ngroup_source, ngroup_target # True when some link in @ha joins source group ngroup_source to target group ngroup_target (both 0-based).
+ offset_source = 0 # word count of all source groups before the requested one
+ (0).upto(ngroup_source-1) { |i|
+ offset_source += @source_groups[i].split.size
+ }
+ offset_target = 0 # word count of all target groups before the requested one
+ (0).upto(ngroup_target-1) { |i|
+ offset_target += @target_groups[i].split.size
+ }
+ (offset_source).upto(-1+offset_source+@source_groups[ngroup_source].split.size) { |i| # every word position inside the source group
+ next if !@ha[i] # this source word has no link
+ @ha[i].each { |k|
+ if (offset_target..(-1+offset_target+@target_groups[ngroup_target].split.size)).include? k # k lies inside the target group
+ return true
+ end
+ }
+ }
+
+ return false # no link between the two groups
+ end
+
+ def to_s # Joins the five fields with " ||| " and appends a newline; f and e are the [X]-rewritten sides.
+ "#{@nt} ||| #{@f} ||| #{@e} ||| #{@v} ||| #{@a}\n"
+ end
end
def conv_cdec_show_deriv s # s: multi-line text; the added lines return two values: the cleaned derivation pieces and the parsed Rule objects.
- a = s.split("}").map { |i|
+ rules = []
+ xx = StringIO.new s # read s line by line
+ d_s = xx.gets # first line: the derivation string
+ while line = xx.gets # every remaining line is parsed as one Rule
+ r = Rule.new(line)
+ rules << r
+ end
+
+ a = d_s.split("}").map { |i| # cut the derivation at each "}"
i.gsub /^[()\s]*/, "" # strip leading parentheses and whitespace
}.reject { |i|
i.size==0 }.map { |i| # drop pieces that became empty
i.gsub /^\{/, "" # strip the leading "{"
}
- return a
+ return a, rules
end
def derive span, spans, by_span, o, groups, source
@@ -63,7 +159,7 @@ def derive span, spans, by_span, o, groups, source
end
}
else
- groups.last << ["#{w}", span.id]
+ groups.last << ["#{w}", span.id, span.trule]
o << w
end
}
@@ -71,13 +167,18 @@ def derive span, spans, by_span, o, groups, source
end
def proc_deriv s
- a = conv_cdec_show_deriv s
+ a, rules = conv_cdec_show_deriv s
by_span = {}
spans = []
id = 0
a.each { |line|
rs = RuleAndSpan.new line, id
+ rules.each { |r|
+ if rs.match_with_rule r
+ rs.trule = r
+ end
+ }
id += 1
by_span[rs.span] = rs
if rs.is_terminal_rule?
@@ -120,15 +221,28 @@ def proc_deriv s
rgroups = []
source_groups.each { |i| source_rgroups << i.first[1] }
groups.each { |i| rgroups << i.first[1] }
+ rules_by_span_id = {}
+ source_groups.each { |i|
+ rules_by_span_id[i.first[1]] = i.first[2]
+ }
phrase_align = []
- source_rgroups.each { |i|
+ count_source = {}
+ count_target = {}
+ count_source.default = 0
+ count_target.default = 0
+ source_rgroups.each_with_index { |i|
phrase_align << []
rgroups.each_with_index { |j,k|
if i==j
- phrase_align.last << k
+ if rules_by_span_id[i].group_has_link count_source[i], count_target[j]
+ phrase_align.last << k
+ end
+ count_target[j] += 1
end
}
+ count_source[i] += 1
+ count_target.clear
}
h = {}
@@ -140,7 +254,11 @@ def proc_deriv s
end
if __FILE__ == $0
- json = proc_deriv(STDIN.gets.strip)
+ s = ""
+ while line = STDIN.gets
+ s += line
+ end
+ json = proc_deriv(s)
obj = JSON.parse(json)
STDERR.write "#{json}\n"
puts obj["target_groups"].join " "
diff --git a/derivation_to_json/example.2.json b/derivation_to_json/example.2.json
new file mode 100644
index 0000000..e3e05fd
--- /dev/null
+++ b/derivation_to_json/example.2.json
@@ -0,0 +1 @@
+{"phrase_alignment":[[0],[1],[2],[3],[4],[5],[6],[7]],"source_groups":["die","neuerung","bezieht sich auf gassensoren","auf basis von","metalloxid @-@ halbleitern ,","die sehr","empfindlich und wenig temperaturabhängig","sind ."],"target_groups":["the","invention","relates to gas sensors","which are based on metal","@-@ oxide semiconductors and which","are very","sensitive and not appreciably temperature @-@ dependent","."]}
diff --git a/derivation_to_json/example.2.output b/derivation_to_json/example.2.output
new file mode 100644
index 0000000..cd3ed15
--- /dev/null
+++ b/derivation_to_json/example.2.output
@@ -0,0 +1 @@
+the invention relates to gas sensors which are based on metal @-@ oxide semiconductors and which are very sensitive and not appreciably temperature @-@ dependent .
diff --git a/derivation_to_json/example.2.raw b/derivation_to_json/example.2.raw
new file mode 100644
index 0000000..7fed320
--- /dev/null
+++ b/derivation_to_json/example.2.raw
@@ -0,0 +1,10 @@
+({<0,21> [Goal] ||| [S] ||| [1]}({<0,21> [S] ||| [S] [X] ||| [1] [2]}({<0,13> [S] ||| [X] ||| [1]}({<0,13> [X] ||| [X] metalloxid @-@ halbleitern , ||| [1] @-@ oxide semiconductors and which}({<0,9> [X] ||| die [X] auf basis von ||| the [1] which are based on metal}({<1,6> [X] ||| [X] bezieht sich auf gassensoren ||| [1] relates to gas sensors}({<1,2> [X] ||| neuerung ||| invention}) ) ) ) ) ({<13,21> [X] ||| die sehr [X] sind . ||| are very [1] .}({<15,19> [X] ||| empfindlich und wenig temperaturabhängig ||| sensitive and not appreciably temperature @-@ dependent}) ) ) )
+[X] ||| neuerung ||| invention ||| ForceRule=1 ||| 0-0
+[X] ||| [X] bezieht sich auf gassensoren ||| [1] relates to gas sensors ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.45124 MaxLexFgivenE=2.73473 CountEF=0.477121 SampleCountF=0.477121 EgivenFCoherent=-0 ||| 1-1 1-2 2-1 4-3 4-4
+[X] ||| die [X] auf basis von ||| the [1] which are based on metal ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=5.95916 MaxLexFgivenE=3.2265 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 2-4 3-4 3-5 4-6
+[X] ||| [X] metalloxid @-@ halbleitern , ||| [1] @-@ oxide semiconductors and which ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=3.79715 MaxLexFgivenE=2.26688 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 1-2 2-1 3-3 4-5
+[S] ||| [X] ||| [1]
+[X] ||| empfindlich und wenig temperaturabhängig ||| sensitive and not appreciably temperature @-@ dependent ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=6.5059 MaxLexFgivenE=1.51642 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 1-1 2-3 3-3 3-4 3-6
+[X] ||| die sehr [X] sind . ||| are very [1] . ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=0.456219 MaxLexFgivenE=2.16613 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 1-1 3-3 4-3
+[S] ||| [S] [X] ||| [1] [2] ||| Glue=1
+[Goal] ||| [S] ||| [1]
diff --git a/derivation_to_json/example.3.json b/derivation_to_json/example.3.json
new file mode 100644
index 0000000..cbfacb8
--- /dev/null
+++ b/derivation_to_json/example.3.json
@@ -0,0 +1 @@
+{"phrase_alignment":[[0],[1],[],[2,4],[3],[5],[6],[],[8],[9],[10]],"source_groups":["in einer","favorisierten","ausführung","dient einer","der schwenkbolzen ( 8 )","zugleich","als messbolzen ,","indem","an ihm dehnungsmess @-@ streifen","( 12",") angebracht sind ."],"target_groups":["in a","fuck that","of","the hinge bolts ( 8 )","is used","at the","time as","a measuring bolt ,","by having strain gauge strips","( 12",") attached to it ."]}
diff --git a/derivation_to_json/example.3.output b/derivation_to_json/example.3.output
new file mode 100644
index 0000000..cee1e65
--- /dev/null
+++ b/derivation_to_json/example.3.output
@@ -0,0 +1 @@
+in a fuck that of the hinge bolts ( 8 ) is used at the time as a measuring bolt , by having strain gauge strips ( 12 ) attached to it .
diff --git a/derivation_to_json/example.3.raw b/derivation_to_json/example.3.raw
new file mode 100644
index 0000000..66a5743
--- /dev/null
+++ b/derivation_to_json/example.3.raw
@@ -0,0 +1,13 @@
+({<0,27> [Goal] ||| [S] ||| [1]}({<0,27> [S] ||| [S] [X] ||| [1] [2]}({<0,12> [S] ||| [X] ||| [1]}({<0,12> [X] ||| in einer [X] ||| in a [1]}({<2,12> [X] ||| [X] ausführung [X] zugleich ||| [1] [2] at the}({<2,3> [X] ||| favorisierten ||| fuck that}) ({<4,11> [X] ||| dient einer [X] ||| of [1] is used}({<6,11> [X] ||| der schwenkbolzen ( 8 ) ||| the hinge bolts ( 8 )}) ) ) ) ) ({<12,27> [X] ||| [X] ) angebracht sind . ||| [1] ) attached to it .}({<12,23> [X] ||| [X] indem [X] ( 12 ||| [1] a measuring bolt , [2] ( 12}({<12,15> [X] ||| als messbolzen , ||| time as}) ({<16,21> [X] ||| an ihm dehnungsmess @-@ streifen ||| by having strain gauge strips}) ) ) ) )
+[X] ||| favorisierten ||| fuck that ||| ForceRule=1 ||| 0-0 0-1
+[X] ||| der schwenkbolzen ( 8 ) ||| the hinge bolts ( 8 ) ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=4.2972 MaxLexFgivenE=3.01678 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-0 1-2 2-1 2-3 2-5 3-4 4-5
+[X] ||| dient einer [X] ||| of [1] is used ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=2.51875 MaxLexFgivenE=3.28665 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-2 0-3 1-0
+[X] ||| [X] ausführung [X] zugleich ||| [1] [2] at the ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=2.87767 MaxLexFgivenE=7.30184 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 3-3
+[X] ||| in einer [X] ||| in a [1] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.605196 MaxLexFgivenE=1.51196 CountEF=0.69897 SampleCountF=1.63347 EgivenFCoherent=1.02119 ||| 0-0 1-1
+[S] ||| [X] ||| [1]
+[X] ||| als messbolzen , ||| time as ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=0.328333 MaxLexFgivenE=2.60944 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-1 1-0 2-0
+[X] ||| an ihm dehnungsmess @-@ streifen ||| by having strain gauge strips ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=7.23159 MaxLexFgivenE=6.6978 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 0-4 2-2 3-0 4-3
+[X] ||| [X] indem [X] ( 12 ||| [1] a measuring bolt , [2] ( 12 ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=10.3295 MaxLexFgivenE=3.69621 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 3-6 4-7
+[X] ||| [X] ) angebracht sind . ||| [1] ) attached to it . ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=5.52044 MaxLexFgivenE=3.17712 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 1-1 2-2 3-3 4-5
+[S] ||| [S] [X] ||| [1] [2] ||| Glue=1
+[Goal] ||| [S] ||| [1]