diff options
author | Patrick Simianer <p@simianer.de> | 2015-11-11 16:10:58 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-11-11 16:10:58 +0100 |
commit | 25674bcbde962f6fc27448af147b88b853a168f7 (patch) | |
tree | 4e95781a163c7790a10bb3a1af0982c2d42a7a89 /derivation_to_json | |
parent | fbf4cb550ebdcefc4552167e5c6938a5fce2b86d (diff) |
extract rules from post-edit alignment
Diffstat (limited to 'derivation_to_json')
-rw-r--r-- | derivation_to_json/after.json | 53 | ||||
-rw-r--r-- | derivation_to_json/before.json | 127 | ||||
-rwxr-xr-x | derivation_to_json/rec.rb | 50 | ||||
-rwxr-xr-x | derivation_to_json/rules.rb | 42 |
4 files changed, 272 insertions, 0 deletions
diff --git a/derivation_to_json/after.json b/derivation_to_json/after.json new file mode 100644 index 0000000..fb58467 --- /dev/null +++ b/derivation_to_json/after.json @@ -0,0 +1,53 @@ +{ + "source": [ + "Weiterhin gehört", + "zur Erfindung", + "die Verwendung dieser Zusammensetzungen zur", + "Therapie und Prophylaxe von", + "Herz-Kreislauf-Erkrankungen", + ", Erkrankungen", + "im Zusammenhang", + "mit einer erhöhten", + "Thrombozytenaggregation,", + "Stoffwechsel-Erkrankungen", + ", Knochenerkrankungen", + "oder", + "Krebserkrankungen", + "." + ], + "target": [ + "Additionally,", + "the invention relates to", + "the use of said compositions for the", + "therapy and prophylaxis of", + "cardiovascular diseases", + ", diseases", + "in conjunction", + "with an increased", + "platelet aggregation,", + "\tmetabolic diseases", + ", osteopathy", + "or", + "cancerous diseases", + "." + ], + "align": [ + "0-0", + "1-1", + "2-2", + "3-3", + "4-4", + "5-5", + "6-6", + "7-7", + "8-8", + "9-9", + "10-10", + "11-11", + "12-12", + "13-13" + ], + "post_edit": "Additionally, the invention relates to the use of said compositions for the therapy and prophylaxis of cardiovascular diseases , diseases in conjunction with an increased platelet aggregation, metabolic diseases , osteopathy or cancerous diseases .", + "duration": 212272, + "source_value": "weiterhin gehört zur Erfindung die Verwendung dieser Zusammensetzungen zur Therapie und Prophylaxe von Herz-Kreislauf-Erkrankungen , Erkrankungen im Zusammenhang mit einer erhöhten Thrombozytenaggregation , Stoffwechsel-Erkrankungen , Knochenerkrankungen oder Krebserkrankungen ." +} diff --git a/derivation_to_json/before.json b/derivation_to_json/before.json new file mode 100644 index 0000000..1d2c911 --- /dev/null +++ b/derivation_to_json/before.json @@ -0,0 +1,127 @@ +{ + "phrase_alignment": [ + [ + 1 + ], + [ + 2, + 0 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ] + ], + "source_rgroups": [ + 4, + 3, + 7, + 6, + 5, + 8, + 9, + 11, + 12, + 13, + 10, + 9, + 8, + 14 + ], + "target_rgroups": [ + 3, + 4, + 3, + 7, + 6, + 5, + 8, + 9, + 11, + 12, + 13, + 10, + 9, + 8, + 14 + ], + "rules_by_span_id": { + "4": "[X] ||| weiterhin gehört ||| also ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=0.458975 MaxLexFgivenE=4.79441 CountEF=0.30103 SampleCountF=0.477121 EgivenFCoherent=0.30103 ||| 0-0 1-0\n", + "3": "[X] ||| [X] zur Erfindung [X] ||| the invention [X] relates to [X] ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=0 MaxLexEgivenF=3.63316 MaxLexFgivenE=1.80404 CountEF=0.30103 SampleCountF=1.27875 EgivenFCoherent=1.25527 ||| 1-4 2-3\n", + "7": "[X] ||| die Verwendung dieser Zusammensetzungen zur ||| the use of said compositions for the ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=2.93053 MaxLexFgivenE=3.26928 CountEF=0.30103 SampleCountF=0.69897 EgivenFCoherent=0.60206 ||| 0-0 1-1 2-3 3-4 4-5\n", + "6": "[X] ||| [X] Therapie und Prophylaxe von ||| [X] therapy and prophylaxis of ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.926982 MaxLexFgivenE=1.42237 CountEF=0.90309 SampleCountF=1.47712 EgivenFCoherent=0.6173 ||| 1-1 2-2 3-3 4-4\n", + "5": "[X] ||| [X] Herz-Kreislauf-Erkrankungen ||| [X] cardiovascular diseases ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.739399 MaxLexFgivenE=0.797149 CountEF=1.38021 SampleCountF=1.69897 EgivenFCoherent=0.328468 ||| 1-1 1-2\n", + "8": "[X] ||| , Erkrankungen [X] Krebserkrankungen [X] ||| , diseases [X] cancerous diseases [X] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.3856 MaxLexFgivenE=0.862494 CountEF=0.477121 SampleCountF=0.778151 EgivenFCoherent=0.39794 ||| 0-0 1-1 3-3 3-4\n", + "9": "[X] ||| im Zusammenhang [X] oder ||| in conjunction [X] or ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.43987 MaxLexFgivenE=2.45332 CountEF=0.778151 SampleCountF=1.44716 EgivenFCoherent=0.732394 ||| 0-0 1-1 3-3\n", + "11": "[X] ||| mit einer erhöhten [X] ||| with an increased [X] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.72017 MaxLexFgivenE=2.04771 CountEF=0.954243 SampleCountF=1.6902 EgivenFCoherent=0.778151 ||| 0-0 1-1 2-2\n", + "12": "[X] ||| Thrombozytenaggregation , [X] ||| platelet aggregation [X] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.886057 MaxLexFgivenE=1.98654 CountEF=0.477121 SampleCountF=0.69897 EgivenFCoherent=0.30103 ||| 0-0 0-1\n", + "13": "[X] ||| Stoffwechsel-Erkrankungen ||| asdf ||| ForceRule=1 ||| 0-0\n", + "10": "[X] ||| [X] , Knochenerkrankungen ||| [X] , osteopathy ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=1.11792 MaxLexFgivenE=0.186321 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 1-1 2-2\n", + "14": "[X] ||| . ||| . ||| IsSupportedOnline=1 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.0201086 MaxLexFgivenE=0.135104 CountEF=2.39967 SampleCountF=2.48287 EgivenFCoherent=0.0835026 ||| 0-0\n" + }, + "source_groups": [ + "weiterhin gehört", + "zur Erfindung", + "die Verwendung dieser Zusammensetzungen zur", + "Therapie und Prophylaxe von", + "Herz-Kreislauf-Erkrankungen", + ", Erkrankungen", + "im Zusammenhang", + "mit einer erhöhten", + "Thrombozytenaggregation ,", + "Stoffwechsel-Erkrankungen", + ", Knochenerkrankungen", + "oder", + "Krebserkrankungen", + "." + ], + "target_groups": [ + "the invention", + "also", + "relates to", + "the use of said compositions for the", + "therapy and prophylaxis of", + "cardiovascular diseases", + ", diseases", + "in conjunction", + "with an increased", + "platelet aggregation", + "asdf", + ", osteopathy", + "or", + "cancerous diseases", + "." + ] +} diff --git a/derivation_to_json/rec.rb b/derivation_to_json/rec.rb new file mode 100755 index 0000000..677a02a --- /dev/null +++ b/derivation_to_json/rec.rb @@ -0,0 +1,50 @@ +#!/usr/bin/env ruby + +require 'json' +require 'zipf' + + +before = JSON.parse(ReadFile.read('x.json')) +after = JSON.parse(ReadFile.read('y.json')) + +alignment = {} +after["align"].each { |i| + a,b = i.split '-' + a = a.to_i + b = b.to_i + if alignment[a] + alignment[a] << b + else + alignment[a] = [b] + end +} + +srg2idx = {} +before['source_rgroups'].uniq.each { |k| + srg2idx[k] = [] + before['source_rgroups'].each_with_index { |i,j| + if i==k + srg2idx[k] << j + end + } +} + +srg2idx.each_pair { |k,v| + a = [] + tgt = [] + v.each { |i| + a << after["source"][i] + tgt << after["target"][alignment[i].first] + } + rule_before = before['rules_by_span_id'][k.to_s] + src_side_before = splitpipe(rule_before)[1] + x = src_side_before.split + a.first.insert(0, " [X] ") if x[0] == "[X]" + a[a.size-1] += " [X] " if x[x.size-1] == "[X]" + puts rule_before + puts "#{k} #{a.join " [X] "}" + puts tgt.to_s + puts "---" + puts +} + diff --git a/derivation_to_json/rules.rb b/derivation_to_json/rules.rb new file mode 100755 index 0000000..b0d267b --- /dev/null +++ b/derivation_to_json/rules.rb @@ -0,0 +1,42 @@ +#!/usr/bin/env ruby + +require 'zipf' + +src = ['Synergistische', 'pharmazeutische Zusammensetzung enthaltend', 'ein Peptid', 'mit 2 bis 5', 'Aminosaeuren'] +target = ["A", "synergistic", "pharmaceutical composition containing", "a peptide", "with 2 to 5", "amino acis"] +align = [[1], [2], [0,3], [4], [5]] + + +def single_nt a + r = [] + r << a + max_sz = a.size-2 + if max_sz<0 + return r + end + a.each_index { |i| + b = Array.new a + b[i] = "[X]" + r << b + c = Array.new b + (1).upto(a.size-(i+1)) { |k| + c = Array.new c + c.delete_at(i+1) + break if c.size<2 + r << c + } + } + + return r +end + +src.each_with_index { |i,j| + src[j..src.size-1].each_with_index { |k,l| + sub = src[j..(j+l)] + r = single_nt sub + r.each { |i| + puts i.to_s + } + } +} + |