diff options
Diffstat (limited to 'overlapping_rules')
-rw-r--r-- | overlapping_rules/README | 15 | ||||
-rwxr-xr-x | overlapping_rules/merge_rules.rb | 179 | ||||
-rwxr-xr-x | overlapping_rules/merge_rules_local.rb | 230 | ||||
-rwxr-xr-x | overlapping_rules/merge_rules_mapred.sh | 12 | ||||
-rwxr-xr-x | overlapping_rules/rules_cross_product.rb | 29 | ||||
-rwxr-xr-x | overlapping_rules/rules_cross_product_mapred.sh | 12 | ||||
-rw-r--r-- | overlapping_rules/util.rb | 116 | ||||
-rwxr-xr-x | overlapping_rules/word_pair_keys.rb | 19 | ||||
-rwxr-xr-x | overlapping_rules/word_pair_keys_mapred.sh | 12 |
9 files changed, 624 insertions, 0 deletions
diff --git a/overlapping_rules/README b/overlapping_rules/README new file mode 100644 index 0000000..5dffd16 --- /dev/null +++ b/overlapping_rules/README @@ -0,0 +1,15 @@ +1. word_pair_keys + group rules by source/target word pairs + input is a cdec grammar (with int index), one rule per line + +2. rules_cross_product + build cross product of rules w/ same key + input is output of 1 + +3. merge_rules + mapred version of merge_rules.rb + +NOTE + cross product doesn't even work with g120: + 319078851 megabytes ~= 300 terabytes + diff --git a/overlapping_rules/merge_rules.rb b/overlapping_rules/merge_rules.rb new file mode 100755 index 0000000..987d417 --- /dev/null +++ b/overlapping_rules/merge_rules.rb @@ -0,0 +1,179 @@ +#!/usr/bin/env ruby + +require_relative './util.rb' + +# ISSUES +# new rule already in global g? +# => uniq! +# singletons? => uniq -D +# R+S = R => X +# variants? +# + + +def is_sub(first, second, dir='left') + if dir=='right' + first = first.reverse; second = second.reverse + end + + second_index = 0 + match = false + first_range = Range.new; second_range = Range.new + first.each_with_index { |i,first_index| + break if i.match('\[X,\d\]') #(/\[X,\d\]/) + break if second_index > second.size-1 + if i == second[second_index] + if !match + first_range.from = first_index + first_range.to = first_index + second_range.from = second_index + second_range.to = second_index + match = true + else + first_range.to = first_index + second_range.to = second_index + end + second_index += 1 + else + first_range.from = first_range.to = second_range.from = second_range.to = nil + end + } + if dir=='right' && first_range.from&&first_range.to&&second_range.from&&second_range.to + first_range.correct(first.size-1) + second_range.correct(second.size-1) + end + return first_range, second_range if (first_range.from&&first_range.to&&second_range.from&&second_range.to)&& + (first_range.from>0||first_range.to>0||second_range.from>0||second_range.to>0) + return false +end + +def merge(r, s, r_f_range, s_f_range, r_e_range, s_e_range, dir) + #ret = [] ? + new_rule = Rule.new + new_rule.f = Array.new(s.f) + new_rule.e = Array.new(s.e) + new_rule.nt = r.nt + if dir == 'left' + return nil if r_f_range.from==0&&r_e_range.from==0 + (r_f_range.from-1).downto(0) { |i| new_rule.f.unshift r.f[i] } + (r_e_range.from-1).downto(0) { |i| new_rule.e.unshift r.e[i] } + elsif dir == 'right' + return nil if r_f_range.from==r.f.size-1&&r_e_range.from==r.e.size-1 + (r_f_range.to+1).upto(r.f.size-1) { |i| new_rule.f << r.f[i] } + (r_e_range.to+1).upto(r.e.size-1) { |i| new_rule.e << r.e[i] } + end + return new_rule +end + +def test + a = ["der", "eurozone", "[X,1]", "die", "[X,2]"] + b = ["eurozone"] + x = ["the", "eurozone", "[X,1]", "[X,2]"] + y = ["eurozone", "members"] + + r,s = is_sub(a,b) + puts "#{r} #{s}" + + r,s = is_sub(b,a) + puts "#{r} #{s}" + + r,s = is_sub(x,y) + puts "#{r} #{s}" + + r,s = is_sub(y,x) + puts "#{r} #{s}" + + puts + puts + + a = ["schuld", "an", "[X,1]", "inflation"] + b = ["schuld"] + x = ["blamed", "for", "[X,1]", "inflation"] + y = ["for", "responsible"] + + r,s = is_sub(a,b) + puts "#{r} #{s}" + + r,s = is_sub(b,a) + puts "#{r} #{s}" + + r,s = is_sub(x,y) + puts "#{r} #{s}" + + r,s = is_sub(y,x) + puts "#{r} #{s}" + exit +end +test if ARGV[0] == 'test' + +# main +def main + strings = {} + while line = STDIN.gets + ids, data = line.split "\t", 2 + r_id, s_id = ids.split '+', 2 + r_id = r_id.to_s + s_id = s_id.to_s + r_raw, s_raw = data.split ' <<<>>> ', 2 + r = Rule.new r_raw, r_id + s = Rule.new s_raw, s_id + #STDERR.puts "reporter:status:processing rules #{r_id} and #{s_id}" + + # left, R->S + range_f_first, range_f_second = is_sub(r.f, s.f) + range_e_first, range_e_second = is_sub(r.e, s.e) + if range_f_first&&range_f_second&&range_e_first&&range_e_second + new_rule = merge(r, s, range_f_first, range_f_first, range_e_first, range_e_second, 'left') + if new_rule && !strings.has_key?(new_rule.rule_id_string) + #puts "R: #{r.f} ||| #{r.e}\nS: #{s.f} ||| #{s.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + puts new_rule.to_s + strings[new_rule.rule_id_string] = true + #puts "\n" + end + end + + # left, S->R + range_f_first, range_f_second = is_sub(s.f, r.f) + range_e_first, range_e_second = is_sub(s.e, r.e) + if range_f_first&&range_f_second&&range_e_first&&range_e_second + new_rule = merge(s, r, range_f_first, range_f_first, range_e_first, range_e_second, 'left') + if new_rule && !strings.has_key?(new_rule.rule_id_string) + #puts "S: #{s.f} ||| #{s.e}\nR: #{r.f} ||| #{r.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + puts new_rule.to_s + strings[new_rule.rule_id_string] = true + #puts "\n" + end + end + + # right, R->S + range_f_first, range_f_second = is_sub(r.f, s.f, 'right') + range_e_first, range_e_second = is_sub(r.e, s.e, 'right') + if range_f_first&&range_f_second&&range_e_first&&range_e_second + new_rule = merge(r, s, range_f_first, range_f_first, range_e_first, range_e_second, 'right') + if new_rule && !strings.has_key?(new_rule.rule_id_string) + #puts "Rr: #{r.f} ||| #{r.e}\nSr: #{s.f} ||| #{s.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + puts new_rule.to_s + strings[new_rule.rule_id_string] = true + #puts "\n" + end + end + + # right, S->R + range_f_first, range_f_second = is_sub(s.f, r.f, 'right') + range_e_first, range_e_second = is_sub(s.e, r.e, 'right') + if range_f_first&&range_f_second&&range_e_first&&range_e_second + new_rule = merge(s, r, range_f_first, range_f_first, range_e_first, range_e_second, 'right') + if new_rule && !strings.has_key?(new_rule.rule_id_string) + #puts "Sr: #{s.f} ||| #{s.e}\nRr: #{r.f} ||| #{r.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + puts new_rule.to_s + strings[new_rule.rule_id_string] = true if new_rule + #puts "\n" + end + end + end +end + diff --git a/overlapping_rules/merge_rules_local.rb b/overlapping_rules/merge_rules_local.rb new file mode 100755 index 0000000..34bdfe5 --- /dev/null +++ b/overlapping_rules/merge_rules_local.rb @@ -0,0 +1,230 @@ +#!/usr/bin/env ruby + +# ISSUES +# new rule already in global g? +# R+S = R +# variants? +# + + +class Rule + attr_accessor :nt, :f, :e, :features, :alignment + + def initialize(s=nil) + return if !s + a = s.strip.split ' ||| ' + @nt = a[0].strip + @f = a[1].split.map{|i| i.strip} + @e = a[2].split.map{|i| i.strip} + @features = {} + a[3].split.each { |i| + name,value = i.split '=' + @features[name] = value.to_f + } + @alignment = a[4].strip + end + + def to_s + feature_string = [] + @features.each_pair { |name,value| feature_string << "#{name}=#{value}" } if @features + feature_string = feature_string.join ' ' + return "#{@nt} ||| #{f.join ' '} ||| #{@e.join ' '} ||| #{feature_string} ||| #{@alignment}" + end + + def rule_string + return "#{@f.join '_'}|||#{@e.join '_'}" + end +end + +class Range + attr_accessor :from, :to + def initialize + @from = nil + @to = nil + end + def to_s + return "#{@from}--#{@to}" + end + def correct(n) + t = @from + @from = n - @to + @to = n - t + end +end + +def ignore(rule) + return true if (rule.f[0].match('\[X,\d\]')&&rule.f[-1].match('\[X,\d\]')&&rule.e[0].match('\[X,\d\]')&&rule.e[-1].match('\[X,\d\]')) + return false +end + +def is_sub(first, second, dir='left') + if dir=='right' + first = first.reverse; second = second.reverse + end + + second_index = 0 + match = false + first_range = Range.new; second_range = Range.new + first.each_with_index { |i,first_index| + break if i.match('\[X,\d\]') #(/\[X,\d\]/) + break if second_index > second.size-1 + if i == second[second_index] + if !match + first_range.from = first_index + first_range.to = first_index + second_range.from = second_index + second_range.to = second_index + match = true + else + first_range.to = first_index + second_range.to = second_index + end + second_index += 1 + else + first_range.from = first_range.to = second_range.from = second_range.to = nil + end + } + if dir=='right' && first_range.from&&first_range.to&&second_range.from&&second_range.to + first_range.correct(first.size-1) + second_range.correct(second.size-1) + end + return first_range, second_range if (first_range.from&&first_range.to&&second_range.from&&second_range.to)&& + (first_range.from>0||first_range.to>0||second_range.from>0||second_range.to>0) + return false +end + +def merge(r, s, r_f_range, s_f_range, r_e_range, s_e_range, dir) + #ret = [] + new_rule = Rule.new + new_rule.f = Array.new(s.f) + new_rule.e = Array.new(s.e) + if dir == 'left' + return nil if r_f_range.from==0&&r_e_range.from==0 + (r_f_range.from-1).downto(0) { |i| new_rule.f.unshift r.f[i] } + (r_e_range.from-1).downto(0) { |i| new_rule.e.unshift r.e[i] } + elsif dir == 'right' + return nil if r_f_range.from==r.f.size-1&&r_e_range.from==r.e.size-1 + (r_f_range.to+1).upto(r.f.size-1) { |i| new_rule.f << r.f[i] } + (r_e_range.to+1).upto(r.e.size-1) { |i| new_rule.e << r.e[i] } + end + return new_rule +end + +def test + a = ["der", "eurozone", "[X,1]", "die", "[X,2]"] + b = ["eurozone"] + x = ["the", "eurozone", "[X,1]", "[X,2]"] # ??? + y = ["eurozone", "members"] + + r,s = is_sub(a,b) + puts "#{r} #{s}" + + r,s = is_sub(b,a) + puts "#{r} #{s}" + + r,s = is_sub(x,y) + puts "#{r} #{s}" + + r,s = is_sub(y,x) + puts "#{r} #{s}" + + puts + puts + + a = ["schuld", "an", "[X,1]", "inflation"] + b = ["schuld"] + x = ["blamed", "for", "[X,1]", "inflation"] + y = ["for", "responsible"] + + r,s = is_sub(a,b) + puts "#{r} #{s}" + + r,s = is_sub(b,a) + puts "#{r} #{s}" + + r,s = is_sub(x,y) + puts "#{r} #{s}" + + r,s = is_sub(y,x) + puts "#{r} #{s}" + exit +end +test if ARGV[0] == 'test' + + +# read rules +rules = [] +strings = {} +if ARGV[0] != 'test' + while line = STDIN.gets + rules << Rule.new(line) + strings[rules[-1].rule_string] = true + end +end + +# main +done = {} +rules.each_with_index { |r,i| + next if ignore(r) + rules.each_with_index { |s,j| + next if done.has_key?(i)||ignore(s)||i==j + possible_overlap = false + r.f.each { |i| + if s.f.include? i + possible_overlap = true + break + end + } + next if !possible_overlap + + # left, R->S + range_f_first, range_f_second = is_sub(r.f, s.f) + range_e_first, range_e_second = is_sub(r.e, s.e) + if range_f_first&&range_f_second&&range_e_first&&range_e_second + #puts "R: #{r.f} ||| #{r.e}\nS: #{s.f} ||| #{s.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + new_rule = merge(r, s, range_f_first, range_f_first, range_e_first, range_e_second, 'left') + #puts "NEW #{new_rule}" if new_rule + puts "X" if (new_rule && !strings.has_key?(new_rule.rule_string)) + #puts + end + + # left, S->R + range_f_first, range_f_second = is_sub(s.f, r.f) + range_e_first, range_e_second = is_sub(s.e, r.e) + if range_f_first&&range_f_second&&range_e_first&&range_e_second + #puts "S: #{s.f} ||| #{s.e}\nR: #{r.f} ||| #{r.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + new_rule = merge(s, r, range_f_first, range_f_first, range_e_first, range_e_second, 'left') + #puts "NEW #{new_rule}" if new_rule + puts "X" if (new_rule && !strings.has_key?(new_rule.rule_string)) + #puts + end + + # right, R->S + range_f_first, range_f_second = is_sub(r.f, s.f, 'right') + range_e_first, range_e_second = is_sub(r.e, s.e, 'right') + if range_f_first&&range_f_second&&range_e_first&&range_e_second + #puts "Rr: #{r.f} ||| #{r.e}\nSr: #{s.f} ||| #{s.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + new_rule = merge(r, s, range_f_first, range_f_first, range_e_first, range_e_second, 'right') + #puts "NEW #{new_rule}" if new_rule + puts "X" if (new_rule && !strings.has_key?(new_rule.rule_string)) + #puts + end + + # right, S->R + range_f_first, range_f_second = is_sub(s.f, r.f, 'right') + range_e_first, range_e_second = is_sub(s.e, r.e, 'right') + if range_f_first&&range_f_second&&range_e_first&&range_e_second + #puts "Sr: #{s.f} ||| #{s.e}\nRr: #{r.f} ||| #{r.e}" + #puts "f:(#{range_f_first} #{range_f_second}) e:(#{range_e_first} #{range_e_second})\n" + new_rule = merge(s, r, range_f_first, range_f_first, range_e_first, range_e_second, 'right') + #puts "NEW #{new_rule}" if new_rule + puts "X" if (new_rule && !strings.has_key?(new_rule.rule_string)) + #puts + end + } + done[i] = true +} + diff --git a/overlapping_rules/merge_rules_mapred.sh b/overlapping_rules/merge_rules_mapred.sh new file mode 100755 index 0000000..e1329f4 --- /dev/null +++ b/overlapping_rules/merge_rules_mapred.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \ + -D mapred.reduce.tasks=23 \ + -D mapred.task.timeout=120000000 \ + -input overlap/g120+index-16mb \ + -cacheFile 'hdfs://10.0.0.1:8020/user/simianer/overlap/g120+index#g120+index' \ + -output overlap/test-g120 \ + -mapper merge_rules.rb \ + -reducer /usr/bin/uniq \ + -file merge_rules.rb \ + diff --git a/overlapping_rules/rules_cross_product.rb b/overlapping_rules/rules_cross_product.rb new file mode 100755 index 0000000..649a94c --- /dev/null +++ b/overlapping_rules/rules_cross_product.rb @@ -0,0 +1,29 @@ +#!/usr/bin/env ruby + +require_relative './util.rb' + + +STDOUT.sync = true +approx_count = 7430933/192 + +i = 0 +prev_id = nil +accumulate = [] +read_rules_from_file2(STDIN, 'stdin') { |id,r| + i += 1 + if id != prev_id + if prev_id + accumulate.sort_by! { |i| i.id }#.uniq { |i| i.id } + accumulate.each_with_index { |a,j| + accumulate[j+1..accumulate.size].each_with_index { |b,k| + STDOUT.write "#{a.id}+#{b.id}\t#{a.to_s} <<<>>> #{b.to_s}\n" + } + } + accumulate.clear + end + prev_id = id + end + accumulate << r + STDERR.write "reporter:status:cross product ##{i} #{((i*100.0)/approx_count).round 2}%\n" +} + diff --git a/overlapping_rules/rules_cross_product_mapred.sh b/overlapping_rules/rules_cross_product_mapred.sh new file mode 100755 index 0000000..a7a96f7 --- /dev/null +++ b/overlapping_rules/rules_cross_product_mapred.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \ + -D mapred.reduce.tasks=384 \ + -D mapred.task.timeout=1200000000 \ + -input overlap/g120_word-pair-keys \ + -output overlap/g120_cross-product \ + -mapper "/bin/cat" \ + -reducer rules_cross_product.rb \ + -file rules_cross_product.rb \ + -file util.rb + diff --git a/overlapping_rules/util.rb b/overlapping_rules/util.rb new file mode 100644 index 0000000..5f1249a --- /dev/null +++ b/overlapping_rules/util.rb @@ -0,0 +1,116 @@ +class Rule + attr_accessor :nt, :f, :e, :features, :alignment, :id + + def initialize(s=nil, id=-1) + return if !s + @id = id + a = s.strip.split ' ||| ' + @nt = a[0].strip + @f = a[1].split.map{|i| i.strip} + @e = a[2].split.map{|i| i.strip} + @features = {} + a[3].split.each { |i| + name,value = i.split '=' + @features[name] = value.to_f + } + @alignment = a[4].strip + end + + def to_s + feature_string = [] + @features.each_pair { |name,value| feature_string << "#{name}=#{value}" } if @features + feature_string = feature_string.join ' ' + return "#{@nt} ||| #{f.join ' '} ||| #{@e.join ' '} ||| #{feature_string} ||| #{@alignment}" + end + + def rule_id_string + return "#{@f.join '_'}|||#{@e.join '_'}" + end + + def fe_word_pairs + a = [] + @f.each { |i| + next if i.match('\[X,\d\]') + @e.each { |j| + next if j.match('\[X,\d\]') + a << "#{[i,j].sort.join '|||'}" + } + } + return a.uniq # we do not want duplicates + end +end + + +class Range + attr_accessor :from, :to + def initialize + @from = nil + @to = nil + end + def to_s + return "#{@from}--#{@to}" + end + def correct(n) + t = @from + @from = n - @to + @to = n - t + end +end + + +def ignore(rule) + return true if (rule.f.first.match('\[X,\d\]')&&rule.f.last.match('\[X,\d\]')|| \ + rule.e.first.match('\[X,\d\]')&&rule.e.last.match('\[X,\d\]')) + return false +end + + +def read_rules_from_file f, fn, ids=nil + STDERR.puts "reporter:status:reading rules from #{fn}" + rules = [] + i = 0 + while line = f.gets + id, data = line.split "\t" + id = id.to_i + r = Rule.new(data, id) + next if ignore(r) + rules << r + ids[r.rule_id_string]=true if ids + i += 1 + STDERR.puts "reporter:status:reading rules from #{fn} (already read #{i} lines)" if i%10===0 + end + f.close + return rules +end + + +def read_rules_from_file1 f, fn, ids=nil + i = 0 + while line = f.gets + id, data = line.split "\t" + id = id.to_i + r = Rule.new(data, id) + next if ignore(r) + yield r + ids[r.rule_id_string]=true if ids + i += 1 + end + f.close +end + + +def read_rules_from_file2 f, fn, ids=nil + i = 0 + while line = f.gets + word_pair_key, data = line.split "\t" + id, rule_str = data.split " ||| ", 2 + id = id.to_i + r = Rule.new(rule_str, id) + next if ignore(r) # prevent overhead later on + yield word_pair_key, r + ids[r.rule_id_string]=true if ids + i += 1 + end + f.close +end + diff --git a/overlapping_rules/word_pair_keys.rb b/overlapping_rules/word_pair_keys.rb new file mode 100755 index 0000000..8b54441 --- /dev/null +++ b/overlapping_rules/word_pair_keys.rb @@ -0,0 +1,19 @@ +#!/usr/bin/env ruby + +require_relative './util.rb' + + +fn = 'newstest2008-grammar+index' +approx_lines_per_shard = 12285856#/23 +STDOUT.sync = true + +i = 0 +read_rules_from_file1(STDIN, 'stdin') { |r| + i += 1 + shard_percentage = ((i*100.0)/approx_lines_per_shard).round 2 + STDERR.write "reporter:status:word pair key ##{i} #{shard_percentage}%\n" + r.fe_word_pairs.each { |p| + puts "#{p}\t#{r.id} ||| #{r.to_s}" + } +} + diff --git a/overlapping_rules/word_pair_keys_mapred.sh b/overlapping_rules/word_pair_keys_mapred.sh new file mode 100755 index 0000000..3c6a3c3 --- /dev/null +++ b/overlapping_rules/word_pair_keys_mapred.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar \ + -D mapred.reduce.tasks=24 \ + -D mapred.task.timeout=1200000000 \ + -input overlap/newstest2008-grammar+index \ + -output overlap/newstest2008-grammar_word-pair-keys \ + -mapper word_pair_keys.rb \ + -reducer "/bin/cat" \ + -file word_pair_keys.rb \ + -file util.rb + |