summaryrefslogtreecommitdiff
path: root/derivation_to_json
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-09-19 14:47:05 +0200
committer Patrick Simianer <p@simianer.de> 2015-09-19 14:47:05 +0200
commit 1576f31ff739288dc21c120ab8c04dbe5406cad7 (patch)
tree e0b05fa1ce8f466f896b90cd89277578f89a2c69 /derivation_to_json
parent 084f652eb431aafd36d0ea64dcd37d4eac7cd393 (diff)
code to convert cdec's output to json for derivation editor
Diffstat (limited to 'derivation_to_json')
-rwxr-xr-x derivation_to_json/deriv.rb 151
-rw-r--r-- derivation_to_json/example 1
-rw-r--r-- derivation_to_json/example.json 1
3 files changed, 153 insertions, 0 deletions
diff --git a/derivation_to_json/deriv.rb b/derivation_to_json/deriv.rb
new file mode 100755
index 0000000..e6230c3
--- /dev/null
+++ b/derivation_to_json/deriv.rb
@@ -0,0 +1,151 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
# Split the single-line, parenthesised derivation printed by cdec into one
# string per rule application.
#
# Every "(" starts a new rule, the "{" / "}" wrapped around each rule are
# dropped, and the run of closing ")" left at the end of a rule is trimmed.
#
# s - derivation string, e.g. "({<0,1> [X] ||| a ||| b})". It is only read,
#     never modified, so frozen strings are accepted and the caller's copy
#     stays intact.
#
# Returns an Array of non-empty rule strings, in the order they occur in s.
def conv_cdec_show_deriv(s)
  rules = s.tr('(', "\n").delete('{}').split("\n")
  # Closing parentheses (possibly separated by blanks) pile up after the last
  # rule of each nesting level; cut them off together with surrounding blanks.
  rules.map! { |rule| rule.strip.gsub(/(\s*\))+$/, '') }
  rules.reject { |rule| rule.strip.empty? }
end
+
# One rule application of a derivation together with the source span it covers.
#
# Instances are built from a line of the form
#   "<i,j> [LHS] ||| source tokens ||| target tokens"
# and expose:
#   span     - [i, j] as Integers
#   symbol   - the left-hand-side label, e.g. "[X]"
#   source   - source tokens; each "[X]" gap is renumbered "[1]", "[2]", ...
#              from left to right
#   target   - target tokens exactly as given
#   subspans - starts empty; filled in by the caller
#   done     - starts false; bookkeeping flag for the caller
#   id       - the id handed to the constructor
class RuleAndSpan
  attr_accessor :span, :symbol, :source, :target, :subspans, :done, :id

  # A token that is, as a whole, a non-terminal placeholder: "[S]" or "[<n>]".
  # The alternation is grouped and anchored; the previous /\[S|(\d+)\]/ parsed
  # as "\[S" OR "\d+\]" and therefore also hit ordinary words such as "2015]".
  NONTERMINAL = /\A\[(?:S|\d+)\]\z/

  # s  - rule line; the three fields are separated with splitpipe (a helper
  #      that is not defined in this file - presumably from the zipf library).
  # id - identifier stored as-is.
  def initialize(s, id)
    head, src, tgt = splitpipe s.strip
    span_str, @symbol = head.split
    @span = span_str.split(',', 2).map { |bound| bound.delete('<>').to_i }
    gap = 0
    # Only a token that is exactly "[X]" is a gap; a word that merely contains
    # "[X]" stays a terminal.
    @source = src.strip.split.map { |tok| tok == '[X]' ? "[#{gap += 1}]" : tok }
    @target = tgt.strip.split
    @subspans = []
    @done = false
    @id = id
  end

  # Tab-separated id and rule. The label is always printed as "[X]",
  # independent of #symbol.
  def to_s
    "#{@id}\t<#{@span.first},#{@span[1]}> [X] ||| #{@source.join(' ')} ||| #{@target.join(' ')}"
  end

  # True when at least one source token is not a non-terminal placeholder.
  def is_terminal_rule?
    @source.any? { |tok| tok !~ NONTERMINAL }
  end
end
+
# Walk one rule and, recursively, the rules filling its numbered gaps, and
# append the terminals that are met to the running output.
#
# Defined at top level (the name and parameter list are kept from the version
# that used to be defined inside proc_deriv on every call).
#
# span    - rule object (responds to source, target, subspans, id, done=)
# spans   - unused; kept so the parameter list stays unchanged
# by_span - Hash mapping a span to its rule object
# o       - Array receiving every terminal token in output order
# groups  - Array of groups; each group is an Array of [token, rule id] pairs
# source  - truthy: walk the source side; falsy: walk the target side
#
# Returns true (the value assigned to span.done).
def derive(span, spans, by_span, o, groups, source)
  # Each rule starts its own group unless an empty one is already waiting.
  groups << [] if groups.empty? || !groups.last.empty?
  tokens = source ? span.source : span.target
  tokens.each_with_index do |w, k|
    gap = w.match(/\[(\d+)\]/)
    if gap
      child = by_span[span.subspans[gap.captures.first.to_i - 1]]
      derive(child, spans, by_span, o, groups, source)
      # If this rule still has a terminal to emit after the gap, that terminal
      # must not be glued onto the child's group: open a fresh one.
      terminal_follows = tokens[(k + 1)..-1].any? { |t| !t.match(/\[(\d+)\]/) }
      groups << [] if terminal_follows && !groups.last.empty?
    else
      groups.last << [w.dup, span.id]
      o << w
    end
  end
  span.done = true
end

# Run derive over every span whose rule is not yet marked done and return the
# groups collected for the requested side (source when `source` is truthy).
def derive_groups(spans, by_span, source)
  groups = []
  tokens = []
  spans.each do |span|
    rule = by_span[span]
    derive(rule, spans, by_span, tokens, groups, source) unless rule.done
  end
  groups
end

# Convert one cdec derivation string into the JSON consumed by the derivation
# editor.
#
# s - the derivation as a single line (see conv_cdec_show_deriv).
#
# Returns a JSON String with three keys:
#   "source_groups"    - source tokens, grouped as emitted by derive
#   "target_groups"    - target tokens, grouped the same way
#   "phrase_alignment" - for each source group, the indices of the target
#                        groups whose first token came from the same rule
# Relies on Hash#to_json being available (json is not required in this file;
# presumably it is loaded through zipf).
def proc_deriv s
  by_span = {}
  spans = []
  conv_cdec_show_deriv(s).each_with_index do |line, id|
    rule = RuleAndSpan.new line, id
    # A later rule with the same span replaces the earlier one.
    by_span[rule.span] = rule
    spans << rule.span if rule.is_terminal_rule?
  end

  # Attach every span to the nearest preceding span that encloses it. Going
  # from the last span to the first collects children in reverse order ...
  (spans.size - 1).downto(0) do |idx|
    child = spans[idx]
    parent_idx = (idx - 1).downto(0).find do |l|
      spans[l][0] <= child[0] && child[1] <= spans[l][1]
    end
    by_span[spans[parent_idx]].subspans << child if parent_idx
  end
  # ... so flip them back into left-to-right order.
  spans.each { |span| by_span[span].subspans.reverse! }

  source_groups = derive_groups(spans, by_span, true)
  spans.each { |span| by_span[span].done = false }
  groups = derive_groups(spans, by_span, false)

  # Rule id of the first token of every group, on either side.
  source_rgroups = source_groups.map { |group| group.first[1] }
  rgroups = groups.map { |group| group.first[1] }

  phrase_align = source_rgroups.map do |rule_id|
    rgroups.each_index.select { |k| rgroups[k] == rule_id }
  end

  h = {}
  h[:phrase_alignment] = phrase_align
  h[:source_groups] = source_groups.map { |group| group.map(&:first) }
  h[:target_groups] = groups.map { |group| group.map(&:first) }

  h.to_json
end
+
# Script entry point: read a single derivation line from standard input and
# print its JSON conversion. STDIN.gets returns nil at end of input, so stop
# with a message instead of failing on nil.strip.
line = STDIN.gets
abort 'deriv.rb: expected one derivation line on standard input' if line.nil?
puts proc_deriv(line.strip)
+
diff --git a/derivation_to_json/example b/derivation_to_json/example
new file mode 100644
index 0000000..5128e09
--- /dev/null
+++ b/derivation_to_json/example
@@ -0,0 +1 @@
+({<0,41> [Goal] ||| [S] ||| [1]}({<0,41> [S] ||| [S] [X] ||| [1] [2]}({<0,37> [S] ||| [S] [X] ||| [1] [2]}({<0,33> [S] ||| [S] [X] ||| [1] [2]}({<0,32> [S] ||| [S] [X] ||| [1] [2]}({<0,30> [S] ||| [S] [X] ||| [1] [2]}({<0,15> [S] ||| [S] [X] ||| [1] [2]}({<0,10> [S] ||| [S] [X] ||| [1] [2]}({<0,9> [S] ||| [S] [X] ||| [1] [2]}({<0,8> [S] ||| [S] [X] ||| [1] [2]}({<0,7> [S] ||| [S] [X] ||| [1] [2]}({<0,5> [S] ||| [X] ||| [1]}({<0,5> [X] ||| hier also [X] , ||| so here [1] ,}({<2,4> [X] ||| ein bescheidener ||| a modest}) ) ) ({<5,7> [X] ||| auf alle ||| to all}) ) ({<7,8> [X] ||| demokratien ||| democracies}) ) ({<8,9> [X] ||| anzuwendender ||| anzuwendender}) ) ({<9,10> [X] ||| vorschlag ||| proposal}) ) ({<10,15> [X] ||| : [X] ideen ||| : [1] ideas}({<11,14> [X] ||| der markt für ||| the market for}) ) ) ({<15,30> [X] ||| funktioniert [X] , [X] der ||| works [1] [2] the}({<16,17> [X] ||| besser ||| better}) ({<18,29> [X] ||| wenn es den [X] ||| if [1]}({<21,29> [X] ||| [X] fällt , die [X] ||| [1] , the [2]}({<21,23> [X] ||| [X] leichter ||| [1] easier}({<21,22> [X] ||| bürgern ||| citizens}) ) ({<26,29> [X] ||| zielkonflikte zwischen [X] ||| trade @-@ offs between [1]}({<28,29> [X] ||| treffsicherheit ||| treffsicherheit}) ) ) ) ) ) ({<30,32> [X] ||| aussagen und ||| statements and}) ) ({<32,33> [X] ||| unterhaltung ||| entertainment}) ) ({<33,37> [X] ||| oder zwischen [X] und ||| or [1] and}({<35,36> [X] ||| treffsicherheit ||| treffsicherheit}) ) ) ({<37,41> [X] ||| [X] zu erkennen . ||| [1] .}({<37,38> [X] ||| parteitreue ||| parteitreue}) ) ) )
diff --git a/derivation_to_json/example.json b/derivation_to_json/example.json
new file mode 100644
index 0000000..b84edad
--- /dev/null
+++ b/derivation_to_json/example.json
@@ -0,0 +1 @@
+{"phrase_alignment":[[0,2],[1],[0,2],[3],[4],[5],[6],[7,9],[8],[7,9],[10,18],[11],[10,18],[12],[13],[14],[15],[16],[17],[10,18],[19],[20],[21,23],[22],[21,23],[24],[25]],"source_groups":[["hier","also"],["ein","bescheidener"],[","],["auf","alle"],["demokratien"],["anzuwendender"],["vorschlag"],[":"],["der","markt","für"],["ideen"],["funktioniert"],["besser"],[","],["wenn","es","den"],["bürgern"],["leichter"],["fällt",",","die"],["zielkonflikte","zwischen"],["treffsicherheit"],["der"],["aussagen","und"],["unterhaltung"],["oder","zwischen"],["treffsicherheit"],["und"],["parteitreue"],["zu","erkennen","."]],"target_groups":[["so","here"],["a","modest"],[","],["to","all"],["democracies"],["anzuwendender"],["proposal"],[":"],["the","market","for"],["ideas"],["works"],["better"],["if"],["citizens"],["easier"],[",","the"],["trade","@-@","offs","between"],["treffsicherheit"],["the"],["statements","and"],["entertainment"],["or"],["treffsicherheit"],["and"],["parteitreue"],["."]]}