summaryrefslogtreecommitdiff
path: root/derivation_to_json/rec.rb
blob: 84bdc0dee46ccee88a78ec4c027d06f0d37ae0cc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env ruby

require 'json'
require 'zipf'


# Parse the two JSON documents the rest of the script works on:
# 'in7.json' becomes `before`, 'out7.json' becomes `after`.
# ReadFile is not defined in this file (presumably provided by zipf).
before, after = ['in7.json', 'out7.json'].map do |path|
  JSON.parse(ReadFile.read(path))
end

# alignment maps an integer parsed from the left side of each "a-b" entry in
# after["align"] to the list of integers parsed from the right side, in the
# order the entries appear.
alignment = {}
after["align"].each do |pair|
  # values_at yields nil for a missing field, and nil.to_i is 0, so malformed
  # entries are coerced to 0 rather than raising.
  left, right = pair.split('-').values_at(0, 1).map(&:to_i)
  (alignment[left] ||= []) << right
end

# srg2idx maps each distinct value in before['source_rgroups'] to the
# ascending list of positions at which that value occurs.
# Built in a single pass instead of rescanning the whole array once per
# distinct value; keys are still inserted in order of first appearance.
srg2idx = {}
before['source_rgroups'].each_with_index do |group, position|
  (srg2idx[group] ||= []) << position
end

# Gathers the source tokens at the given positions and the target tokens
# they are aligned to.
#
# before    - accepted for call-site symmetry; not read by this method.
# after     - object whose "source" and "target" entries are indexable by
#             integer position.
# alignment - maps a source position to a list of target positions; only the
#             first target position of each list is used.
# v         - list of source positions to look up.
# dontsort  - when true, target positions stay in the order of v; otherwise
#             they are sorted ascending.
#
# Returns three arrays: the source tokens, the target tokens, and the target
# positions those target tokens were taken from.
def get_target_phrases_for_source_span(before, after, alignment, v, dontsort=false)
  source_tokens = v.map { |pos| after["source"][pos] }

  # Positions without an alignment entry contribute nothing on the target side.
  aligned_positions = v.select { |pos| alignment[pos] }
  target_positions = aligned_positions.map { |pos| alignment[pos].first }
  target_positions = target_positions.sort unless dontsort

  target_tokens = target_positions.map { |pos| after["target"][pos] }

  [source_tokens, target_tokens, target_positions]
end


# Debug report, one section per entry of srg2idx.
# k is a key of srg2idx; its string form indexes before['rules_by_span_id']
# and before['span_info']. v is the list of source positions recorded for k.
# splitpipe is not defined in this file (presumably provided by zipf).
srg2idx.each_pair { |k,v|
  key = k.to_s
  a, tgt, target_phrases = get_target_phrases_for_source_span before, after, alignment, v
  rule_before = before['rules_by_span_id'][key]
  src_side_before = splitpipe(rule_before)[1]
  x = src_side_before.split
  # Build new strings for the decorated ends instead of inserting in place:
  # the elements of `a` are the very objects stored in after["source"], and
  # the report should not alter the loaded data.
  a[0] = " [X] " + a[0] if x.first == "[X]"
  a[-1] += " [X] " if x.last == "[X]"
  puts rule_before
  puts "#{k} #{a.join " [X] "}"
  puts tgt.to_s
  span_info = before["span_info"][key]
  puts span_info.to_s
  puts "target phrases #{target_phrases}"
  s = target_phrases.uniq.map { |j| after["target"][j] + " " }.join
  puts "S: #{s}"
  subspans = span_info[1]
  puts "nothing to do" if subspans.empty?
  target_phrase_sub = subspans.map { |subspan|
    puts subspan.to_s
    subid = before["span2id"][subspan.to_s]
    puts "subid #{subid}"
    puts "XXX #{srg2idx[subid]}"
    # A sub-span with no entry in srg2idx has no source positions of its own;
    # treat it as an empty span rather than passing nil to the helper.
    _, _, tp = get_target_phrases_for_source_span before, after, alignment, srg2idx.fetch(subid, []), true
    tp
  }
  puts "targ ph sub #{target_phrase_sub.to_s}"
  puts "---"
  puts
}