summaryrefslogtreecommitdiff
path: root/derivation_to_json
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2015-11-11 16:10:58 +0100
committerPatrick Simianer <p@simianer.de>2015-11-11 16:10:58 +0100
commit25674bcbde962f6fc27448af147b88b853a168f7 (patch)
tree4e95781a163c7790a10bb3a1af0982c2d42a7a89 /derivation_to_json
parentfbf4cb550ebdcefc4552167e5c6938a5fce2b86d (diff)
extract rules from post-edit alignment
Diffstat (limited to 'derivation_to_json')
-rw-r--r--derivation_to_json/after.json53
-rw-r--r--derivation_to_json/before.json127
-rwxr-xr-xderivation_to_json/rec.rb50
-rwxr-xr-xderivation_to_json/rules.rb42
4 files changed, 272 insertions, 0 deletions
diff --git a/derivation_to_json/after.json b/derivation_to_json/after.json
new file mode 100644
index 0000000..fb58467
--- /dev/null
+++ b/derivation_to_json/after.json
@@ -0,0 +1,53 @@
+{
+ "source": [
+ "Weiterhin gehört",
+ "zur Erfindung",
+ "die Verwendung dieser Zusammensetzungen zur",
+ "Therapie und Prophylaxe von",
+ "Herz-Kreislauf-Erkrankungen",
+ ", Erkrankungen",
+ "im Zusammenhang",
+ "mit einer erhöhten",
+ "Thrombozytenaggregation,",
+ "Stoffwechsel-Erkrankungen",
+ ", Knochenerkrankungen",
+ "oder",
+ "Krebserkrankungen",
+ "."
+ ],
+ "target": [
+ "Additionally,",
+ "the invention relates to",
+ "the use of said compositions for the",
+ "therapy and prophylaxis of",
+ "cardiovascular diseases",
+ ", diseases",
+ "in conjunction",
+ "with an increased",
+ "platelet aggregation,",
+ "\tmetabolic diseases",
+ ", osteopathy",
+ "or",
+ "cancerous diseases",
+ "."
+ ],
+ "align": [
+ "0-0",
+ "1-1",
+ "2-2",
+ "3-3",
+ "4-4",
+ "5-5",
+ "6-6",
+ "7-7",
+ "8-8",
+ "9-9",
+ "10-10",
+ "11-11",
+ "12-12",
+ "13-13"
+ ],
+ "post_edit": "Additionally, the invention relates to the use of said compositions for the therapy and prophylaxis of cardiovascular diseases , diseases in conjunction with an increased platelet aggregation, metabolic diseases , osteopathy or cancerous diseases .",
+ "duration": 212272,
+ "source_value": "weiterhin gehört zur Erfindung die Verwendung dieser Zusammensetzungen zur Therapie und Prophylaxe von Herz-Kreislauf-Erkrankungen , Erkrankungen im Zusammenhang mit einer erhöhten Thrombozytenaggregation , Stoffwechsel-Erkrankungen , Knochenerkrankungen oder Krebserkrankungen ."
+}
diff --git a/derivation_to_json/before.json b/derivation_to_json/before.json
new file mode 100644
index 0000000..1d2c911
--- /dev/null
+++ b/derivation_to_json/before.json
@@ -0,0 +1,127 @@
+{
+ "phrase_alignment": [
+ [
+ 1
+ ],
+ [
+ 2,
+ 0
+ ],
+ [
+ 3
+ ],
+ [
+ 4
+ ],
+ [
+ 5
+ ],
+ [
+ 6
+ ],
+ [
+ 7
+ ],
+ [
+ 8
+ ],
+ [
+ 9
+ ],
+ [
+ 10
+ ],
+ [
+ 11
+ ],
+ [
+ 12
+ ],
+ [
+ 13
+ ],
+ [
+ 14
+ ]
+ ],
+ "source_rgroups": [
+ 4,
+ 3,
+ 7,
+ 6,
+ 5,
+ 8,
+ 9,
+ 11,
+ 12,
+ 13,
+ 10,
+ 9,
+ 8,
+ 14
+ ],
+ "target_rgroups": [
+ 3,
+ 4,
+ 3,
+ 7,
+ 6,
+ 5,
+ 8,
+ 9,
+ 11,
+ 12,
+ 13,
+ 10,
+ 9,
+ 8,
+ 14
+ ],
+ "rules_by_span_id": {
+ "4": "[X] ||| weiterhin gehört ||| also ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=0.458975 MaxLexFgivenE=4.79441 CountEF=0.30103 SampleCountF=0.477121 EgivenFCoherent=0.30103 ||| 0-0 1-0\n",
+ "3": "[X] ||| [X] zur Erfindung [X] ||| the invention [X] relates to [X] ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=0 MaxLexEgivenF=3.63316 MaxLexFgivenE=1.80404 CountEF=0.30103 SampleCountF=1.27875 EgivenFCoherent=1.25527 ||| 1-4 2-3\n",
+ "7": "[X] ||| die Verwendung dieser Zusammensetzungen zur ||| the use of said compositions for the ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=2.93053 MaxLexFgivenE=3.26928 CountEF=0.30103 SampleCountF=0.69897 EgivenFCoherent=0.60206 ||| 0-0 1-1 2-3 3-4 4-5\n",
+ "6": "[X] ||| [X] Therapie und Prophylaxe von ||| [X] therapy and prophylaxis of ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.926982 MaxLexFgivenE=1.42237 CountEF=0.90309 SampleCountF=1.47712 EgivenFCoherent=0.6173 ||| 1-1 2-2 3-3 4-4\n",
+ "5": "[X] ||| [X] Herz-Kreislauf-Erkrankungen ||| [X] cardiovascular diseases ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.739399 MaxLexFgivenE=0.797149 CountEF=1.38021 SampleCountF=1.69897 EgivenFCoherent=0.328468 ||| 1-1 1-2\n",
+ "8": "[X] ||| , Erkrankungen [X] Krebserkrankungen [X] ||| , diseases [X] cancerous diseases [X] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.3856 MaxLexFgivenE=0.862494 CountEF=0.477121 SampleCountF=0.778151 EgivenFCoherent=0.39794 ||| 0-0 1-1 3-3 3-4\n",
+ "9": "[X] ||| im Zusammenhang [X] oder ||| in conjunction [X] or ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.43987 MaxLexFgivenE=2.45332 CountEF=0.778151 SampleCountF=1.44716 EgivenFCoherent=0.732394 ||| 0-0 1-1 3-3\n",
+ "11": "[X] ||| mit einer erhöhten [X] ||| with an increased [X] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=1.72017 MaxLexFgivenE=2.04771 CountEF=0.954243 SampleCountF=1.6902 EgivenFCoherent=0.778151 ||| 0-0 1-1 2-2\n",
+ "12": "[X] ||| Thrombozytenaggregation , [X] ||| platelet aggregation [X] ||| IsSupportedOnline=0 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.886057 MaxLexFgivenE=1.98654 CountEF=0.477121 SampleCountF=0.69897 EgivenFCoherent=0.30103 ||| 0-0 0-1\n",
+ "13": "[X] ||| Stoffwechsel-Erkrankungen ||| asdf ||| ForceRule=1 ||| 0-0\n",
+ "10": "[X] ||| [X] , Knochenerkrankungen ||| [X] , osteopathy ||| IsSupportedOnline=0 IsSingletonFE=1 IsSingletonF=1 MaxLexEgivenF=1.11792 MaxLexFgivenE=0.186321 CountEF=0.30103 SampleCountF=0.30103 EgivenFCoherent=-0 ||| 1-1 2-2\n",
+ "14": "[X] ||| . ||| . ||| IsSupportedOnline=1 IsSingletonFE=0 IsSingletonF=0 MaxLexEgivenF=0.0201086 MaxLexFgivenE=0.135104 CountEF=2.39967 SampleCountF=2.48287 EgivenFCoherent=0.0835026 ||| 0-0\n"
+ },
+ "source_groups": [
+ "weiterhin gehört",
+ "zur Erfindung",
+ "die Verwendung dieser Zusammensetzungen zur",
+ "Therapie und Prophylaxe von",
+ "Herz-Kreislauf-Erkrankungen",
+ ", Erkrankungen",
+ "im Zusammenhang",
+ "mit einer erhöhten",
+ "Thrombozytenaggregation ,",
+ "Stoffwechsel-Erkrankungen",
+ ", Knochenerkrankungen",
+ "oder",
+ "Krebserkrankungen",
+ "."
+ ],
+ "target_groups": [
+ "the invention",
+ "also",
+ "relates to",
+ "the use of said compositions for the",
+ "therapy and prophylaxis of",
+ "cardiovascular diseases",
+ ", diseases",
+ "in conjunction",
+ "with an increased",
+ "platelet aggregation",
+ "asdf",
+ ", osteopathy",
+ "or",
+ "cancerous diseases",
+ "."
+ ]
+}
diff --git a/derivation_to_json/rec.rb b/derivation_to_json/rec.rb
new file mode 100755
index 0000000..677a02a
--- /dev/null
+++ b/derivation_to_json/rec.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require 'json'
+require 'zipf'
+
+
# Print, for every rule id found in the "before" dump, the rule as it was
# used originally next to a source side rebuilt from the "after" dump.
#
# Usage: rec.rb [BEFORE_JSON [AFTER_JSON]]
#   BEFORE_JSON  read for 'source_rgroups' and 'rules_by_span_id'
#                (default: 'x.json')
#   AFTER_JSON   read for 'source', 'target' and 'align'
#                (default: 'y.json')
#
# ReadFile and splitpipe are provided by the zipf gem.
# NOTE(review): splitpipe is presumably splitting the rule string on "|||",
# so element 1 is the rule's source side -- confirm against zipf.
before = JSON.parse(ReadFile.read(ARGV[0] || 'x.json'))
after = JSON.parse(ReadFile.read(ARGV[1] || 'y.json'))

# Source index => target indices, parsed from the "s-t" strings in 'align'.
alignment = {}
after['align'].each do |link|
  src_idx, tgt_idx = link.split('-')
  (alignment[src_idx.to_i] ||= []) << tgt_idx.to_i
end

# Rule id => positions in 'source_rgroups' that carry this id. A single pass
# keeps ids in first-occurrence order and positions in ascending order.
srg2idx = {}
before['source_rgroups'].each_with_index do |rule_id, pos|
  (srg2idx[rule_id] ||= []) << pos
end

srg2idx.each_pair do |rule_id, positions|
  src_parts = positions.map { |pos| after['source'][pos] }

  # Only the first aligned target index per source position is reported.
  # Positions without any alignment link are skipped instead of crashing.
  tgt_parts = positions.map { |pos| alignment.fetch(pos, []).first }
                       .compact
                       .map { |tgt_idx| after['target'][tgt_idx] }

  rule_before = before['rules_by_span_id'][rule_id.to_s]
  rule_src_tokens = splitpipe(rule_before)[1].split

  # Mirror a leading/trailing "[X]" of the original rule's source side.
  # New strings are built so the parsed JSON data is left untouched.
  src_parts[0] = " [X] #{src_parts[0]}" if rule_src_tokens.first == '[X]'
  src_parts[-1] = "#{src_parts[-1]} [X] " if rule_src_tokens.last == '[X]'

  puts rule_before
  puts "#{rule_id} #{src_parts.join(' [X] ')}"
  puts tgt_parts.to_s
  puts '---'
  puts
end
+
diff --git a/derivation_to_json/rules.rb b/derivation_to_json/rules.rb
new file mode 100755
index 0000000..b0d267b
--- /dev/null
+++ b/derivation_to_json/rules.rb
@@ -0,0 +1,42 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
# Hand-written sample sentence pair, already split into phrase groups.
# Only src is consumed by the code visible in this script; target and align
# are defined alongside it but not read here.
src = ['Synergistische', 'pharmazeutische Zusammensetzung enthaltend', 'ein Peptid', 'mit 2 bis 5', 'Aminosaeuren']
target = ["A", "synergistic", "pharmaceutical composition containing", "a peptide", "with 2 to 5", "amino acids"]
# NOTE(review): presumably align[i] lists the target indices linked to
# src[i] -- confirm before relying on it.
align = [[1], [2], [0,3], [4], [5]]
+
+
# Enumerate the variants of a phrase sequence in which exactly one contiguous
# run of elements is replaced by a single "[X]" placeholder.
#
# @param a [Array<String>] the sequence; it is neither modified nor returned
# @return [Array<Array<String>>] a copy of +a+ first, followed by the
#   variants ordered by the start of the replaced run (left to right) and,
#   for each start, by growing run length. A variant consisting of "[X]"
#   alone is never produced. For inputs with fewer than two elements only
#   the copy is returned.
def single_nt(a)
  # Copy the input so callers mutating the result cannot corrupt their array.
  variants = [a.dup]
  return variants if a.size < 2

  a.each_index do |start|
    (start...a.size).each do |stop|
      variant = a[0...start] + ['[X]'] + a[(stop + 1)..-1]
      # Happens only when the run covers the whole sequence.
      break if variant.size < 2

      variants << variant
    end
  end

  variants
end
+
# For every contiguous sub-sequence of src (by start index, then by growing
# length), print each variant returned by single_nt on its own line.
src.each_index do |start|
  (start...src.size).each do |stop|
    single_nt(src[start..stop]).each { |variant| puts variant.to_s }
  end
end
+