diff options
Diffstat (limited to 'derivation_to_json')
-rwxr-xr-x | derivation_to_json/derivation_to_json.rb | 30 | ||||
-rwxr-xr-x | derivation_to_json/rec.rb | 37 |
2 files changed, 54 insertions, 13 deletions
diff --git a/derivation_to_json/derivation_to_json.rb b/derivation_to_json/derivation_to_json.rb index 3a4eb65..b14b0b5 100755 --- a/derivation_to_json/derivation_to_json.rb +++ b/derivation_to_json/derivation_to_json.rb @@ -42,9 +42,10 @@ class RuleAndSpan end class Rule - attr_accessor :nt, :f, :e, :v, :a, :ha, :source_groups, :target_groups + attr_accessor :nt, :f, :e, :v, :a, :ha, :source_groups, :target_groups, :raw_rule_str def initialize s + @raw_rule_str = s.strip splitpipe(s).each_with_index { |i,j| i = i.strip.lstrip if j == 0 # NT @@ -115,7 +116,8 @@ class Rule end def to_s - "#{@nt} ||| #{@f} ||| #{@e} ||| #{@v} ||| #{@a}\n" + #"#{@nt} ||| #{@f} ||| #{@e} ||| #{@v} ||| #{@a}\n" + "#{raw_rule_str}" end end @@ -138,7 +140,7 @@ def conv_cdec_show_deriv s return a, rules end -def derive span, spans, by_span, o, groups, source +def derive span, by_span, o, groups, source if groups.size==0 || groups.last.size>0 groups << [] end @@ -152,7 +154,7 @@ def derive span, spans, by_span, o, groups, source nt = w.match /\[(\d+)\]/ if nt idx = nt.captures.first.to_i-1 - _ = derive by_span[span.subspans[idx]], spans, by_span, o, groups, source + _ = derive by_span[span.subspans[idx]], by_span, o, groups, source (k+1).upto(a.size-1) { |i| if !a[i].match(/\[(\d+)\]/) && groups.last.size>0 groups << [] @@ -205,7 +207,7 @@ def proc_deriv s source_groups = [] spans.each { |span| next if by_span[span].done - derive by_span[span], spans, by_span, so, source_groups, true + derive by_span[span], by_span, so, source_groups, true } spans.each { |s| by_span[s].done = false } @@ -214,7 +216,7 @@ def proc_deriv s groups = [] spans.each { |span| next if by_span[span].done - derive by_span[span], spans, by_span, o, groups, false + derive by_span[span], by_span, o, groups, false } source_rgroups = [] @@ -226,6 +228,7 @@ def proc_deriv s rules_by_span_id[i.first[1]] = i.first[2] } + # make/fake phrase alignment phrase_align = [] count_source = {} count_target = {} @@ -256,12 +259,9 @@ def proc_deriv s end } } - puts add_to.to_s - puts phrase_align.to_s add_to.each { |k| phrase_align[k] << j } - puts phrase_align.to_s end } @@ -281,6 +281,15 @@ def proc_deriv s } } + # span info + span_info = {} + span2id = {} + by_span.each { |k,v| + span_info[v.id] = [k, v.subspans] + span2id[k] = v.id + } + + # final object h = {} h[:phrase_alignment] = phrase_align h[:source_rgroups] = source_rgroups @@ -288,6 +297,8 @@ def proc_deriv s h[:rules_by_span_id] = rules_by_span_id h[:source_groups] = source_groups.map { |a| a.map { |i| i.first }.join " " } h[:target_groups] = groups.map { |a| a.map { |i| i.first }.join " " } + h[:span_info] = span_info + h[:span2id] = span2id return h.to_json end @@ -300,6 +311,7 @@ if __FILE__ == $0 json = proc_deriv(s) obj = JSON.parse(json) STDERR.write "#{json}\n" + puts obj["source_groups"].join " " puts obj["target_groups"].join " " end diff --git a/derivation_to_json/rec.rb b/derivation_to_json/rec.rb index 677a02a..84bdc0d 100755 --- a/derivation_to_json/rec.rb +++ b/derivation_to_json/rec.rb @@ -4,8 +4,8 @@ require 'json' require 'zipf' -before = JSON.parse(ReadFile.read('x.json')) -after = JSON.parse(ReadFile.read('y.json')) +before = JSON.parse(ReadFile.read('in7.json')) +after = JSON.parse(ReadFile.read('out7.json')) alignment = {} after["align"].each { |i| @@ -29,13 +29,26 @@ before['source_rgroups'].uniq.each { |k| } } -srg2idx.each_pair { |k,v| +def get_target_phrases_for_source_span before, after, alignment, v, dontsort=false a = [] tgt = [] + target_phrases = [] # alignment seen from target v.each { |i| a << after["source"][i] - tgt << after["target"][alignment[i].first] + target_phrases << alignment[i].first if alignment[i] + } + target_phrases.sort! if !dontsort + target_phrases.each { |j| + tgt << after["target"][j] } + + return a, tgt, target_phrases +end + + +# k is a rule id in after['rules_by_span_id'] +srg2idx.each_pair { |k,v| + a, tgt, target_phrases = get_target_phrases_for_source_span before, after, alignment, v rule_before = before['rules_by_span_id'][k.to_s] src_side_before = splitpipe(rule_before)[1] x = src_side_before.split @@ -44,6 +57,22 @@ srg2idx.each_pair { |k,v| puts rule_before puts "#{k} #{a.join " [X] "}" puts tgt.to_s + puts before["span_info"][k.to_s].to_s + puts "target phrases #{target_phrases}" + s = "" + target_phrases.uniq.each { |j| s += after["target"][j]+" " } + puts "S: #{s}" + puts "nothing to do" if before["span_info"][k.to_s][1].size==0 + target_phrase_sub = [] + before["span_info"][k.to_s][1].each { |subspan| + puts subspan.to_s + subid = before["span2id"][subspan.to_s] + puts "subid #{subid}" + puts "XXX #{srg2idx[subid]}" + _, _, tp = get_target_phrases_for_source_span before, after, alignment, srg2idx[subid], true + target_phrase_sub << tp + } + puts "targ ph sub #{target_phrase_sub.to_s}" puts "---" puts } |