From b385228969382185afee33d0661790b2831ac6c1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 20 May 2014 14:40:55 +0200 Subject: intersect2 correct & fast (ok, at least not impossibly slow) --- README.md | 1 + example/grammar | 5 +- grammar.rb | 24 ++++-- intersect.rb | 47 +++++++----- intersect1.rb | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ intersect2.rb | 176 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 443 insertions(+), 31 deletions(-) create mode 100644 intersect1.rb create mode 100644 intersect2.rb diff --git a/README.md b/README.md index 5559e0d..30acc8b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ nothing to see here +https://github.com/jweese/thrax/wiki/Glue-grammar diff --git a/example/grammar b/example/grammar index 1d72ce5..4a3c93c 100644 --- a/example/grammar +++ b/example/grammar @@ -1,8 +1,5 @@ [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 -[S] ||| ich sah ein kleines haus ||| -[S] ||| ich [VP,2] ||| i [1] [2] ||| logp=0 -[S] ||| ich sah ein [NN,1] haus ||| i saw a [NN,1] house ||| logp=0 -[S] ||| ich [V,1] ein [NN,1] haus ||| i [1] a [2] house ||| logp=0 +[S] ||| ich sah ein [NN,1] ||| [NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0 [NN] ||| kleines haus ||| small house ||| logp=0 use_house=1 diff --git a/grammar.rb b/grammar.rb index 8a08cc1..4183a8f 100644 --- a/grammar.rb +++ b/grammar.rb @@ -65,15 +65,23 @@ class Rule end class Grammar - attr_accessor :rules + attr_accessor :rules, :rewrite, :flat, :mixed def initialize fn - @rules = [] - @glue_rules = [] - ReadFile.readlines_strip(fn).each_with_index { |s,j| + @rules = []; @rewrite = []; @flat = []; @mixed = [] + ReadFile.readlines_strip(fn).each_with_index { |s,i| STDERR.write '.' - STDERR.write "\n" if (j+1)%80==0 + STDERR.write "\n" if (i+1)%80==0 @rules << Rule.from_s(s) + if @rules.last.rhs.first.class == NT + @rewrite << @rules.last + else + if rules.last.arity == 0 + @flat << @rules.last + else + @mixed << @rules.last + end + end } STDERR.write "\n" end @@ -85,18 +93,18 @@ class Grammar end def add_glue_rules - # see https://github.com/jweese/thrax/wiki/Glue-grammar @rules.map { |r| r.lhs.symbol }.reject { |s| s=='S' }.uniq.each { |s| @rules << Rule.new(NT.new('S'), [NT.new(s)]) - @glue_rules << @rules.last + @rewrite << @rules.last @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')]) - @glue_rules << @rules.last + @rewrite << @rules.last } end def add_pass_through_rules input input.each { |terminal| @rules << Rule.new(NT.new('X'), [T.new(terminal.word)]) + @flat << @rules.last } end end diff --git a/intersect.rb b/intersect.rb index fa4a3bd..64b9696 100644 --- a/intersect.rb +++ b/intersect.rb @@ -68,7 +68,8 @@ end def init active_chart, passive_chart, grammar, input, n # pre-fill passive chart w/ 0-arity rules - grammar.rules.select { |r| r.rhs.first.class==T }.each { |r| + #grammar.rules.select { |r| r.rhs.first.class==T }.each { |r| + grammar.rules.select { |r| r.arity==0 }.each { |r| input.each_index.select { |i| input[i].word==r.rhs.first.word }.each { |j| k = 1 if r.rhs.size > 1 @@ -86,16 +87,16 @@ def init active_chart, passive_chart, grammar, input, n end if k == r.rhs.size passive_chart.add(r, j, j+k, j+k, k) - else - (j+k).upto(n) { |l| active_chart.add r, j, l, j+k, k } + #else + # (j+k).upto(n) { |l| active_chart.add r, j, l, j+k, k } end } } # seed active chart - s = grammar.rules.reject { |r| r.rhs.first.class!=NT } - visit(n, n, 1) { |i,j| - s.each { |r| active_chart.add(r, i, j, i) } - } + #s = grammar.rules.reject { |r| r.rhs.first.class!=NT } + #visit(n, n, 1) { |i,j| + # s.each { |r| active_chart.add(r, i, j, i) } + #} end def scan item, passive_chart, input @@ -111,11 +112,19 @@ def scan item, passive_chart, input end end -def parse i, j, n, active_chart, passive_chart, input - active_chart.at(i,j).each { |active_item| +def parse i, j, n, active_chart, passive_chart, input, grammar + grammar.rules.each { |r| + if r.rhs.first.class==T&&r.rhs.first.word==input[i].word + new_item = Item.new r + scan new_item, passive_chart, input + end + } + #active_chart.at(i,j).each { |active_item| 1.upto(n) { |span| break if span==(j-i) i.upto(j-span) { |k| + STDERR.write " #{k},#{k+span}\n" + <<-DOC passive_chart.at(k, k+span).each { |passive_item| if active_item.rhs[active_item.dot].class==NT && passive_item.lhs.symbol == active_item.rhs[active_item.dot].symbol next if not active_item.span.right==passive_item.span.left @@ -131,10 +140,10 @@ def parse i, j, n, active_chart, passive_chart, input end end } + DOC } } - true - } + #} # self-filling to_add_active = [] to_add_passive = [] @@ -164,14 +173,14 @@ def preprocess s end def main - #input = preprocess 'ich sah ein kleines haus' - input = preprocess 'lebensmittel schuld an europäischer inflation' + input = preprocess 'ich sah ein kleines haus' + #input = preprocess 'lebensmittel schuld an europäischer inflation' #input = preprocess 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .' n = input.size puts 'reading grammar' - #g = Grammar.new 'example/grammar' - g = Grammar.new 'example/grammar.x' + g = Grammar.new 'example/grammar' + #g = Grammar.new 'example/grammar.x' #g = Grammar.new 'example/grammar.3.gz' # 4th segment of newstest2008 puts 'adding glue rules' @@ -187,12 +196,12 @@ def main puts 'parsing' visit(n, n, 1) { |i,j| - STDERR.write " span (#{i}, #{j})\n" - parse i, j, n, active_chart, passive_chart, input + STDERR.write "span (#{i}, #{j})\n" + parse i, j, n, active_chart, passive_chart, input, g } - puts "---\npassive chart" - visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s if item.span.left==i&&item.span.right==j }; puts } + #puts "---\npassive chart" + #visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s if item.span.left==i&&item.span.right==j }; puts } end diff --git a/intersect1.rb b/intersect1.rb new file mode 100644 index 0000000..fd465b0 --- /dev/null +++ b/intersect1.rb @@ -0,0 +1,221 @@ +#!/usr/bin/env ruby + +require_relative './grammar.rb' +STDOUT.sync = true + + +class Chart + attr_accessor :has + + def initialize n + @m = [] + (n+1).times { + _ = [] + (n+1).times { _ << [] } + @m << _ + } + @has = {} + end + + def at i, j + @m[i][j] + end + + def add rule_or_item, i, j, right, dot=0 + item = Item.new(rule_or_item) + item.span.left = i + item.span.right = right + item.dot = dot + at(i, j) << item + end +end + +class Span + attr_accessor :left, :right + + def initialize left=nil, right=nil + @left = left + @right = right + end +end + +class Item < Rule + attr_accessor :lhs, :rhs, :span, :dot + + def initialize rule_or_item + @lhs = rule_or_item.lhs.dup + @rhs = rule_or_item.rhs.dup + if rule_or_item.class == Item + @span = Span.new rule_or_item.span.left, rule_or_item.span.right + @dot = rule_or_item.dot + else + @span = Span.new + @dot = 0 + end + end + + def to_s + "#{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] [arity=#{arity}] (#{@span.left}, #{@span.right})" + end +end + +def visit n, depth, skip=0 + (depth-skip).times { |i| + i += skip + 0.upto(n-(i+1)) { |j| + yield j, j+i+1 + } + } +end + +def preprocess s + s.split.map { |i| T.new i } +end + +def init input, n, active_chart, passive_chart, grammar + grammar.rules.select { |r| r.arity==0 }.each { |r| + input.each_index { |i| + if input[i, r.rhs.size].map{|x|x.word}==r.rhs.map{|x|x.word} + passive_chart.add Item.new(r), i, i+r.rhs.size, i+r.rhs.size, r.rhs.size + passive_chart.d["#{i},#{i+r.rhs.size},#{r.lhs.symbol}"] = true + end + } + } +end + +def scan item, passive_chart, input + while item.rhs[item.dot].class == T + break if item.span.right > input.size-1 + if item.rhs[item.dot].word == input[item.span.right].word + item.dot += 1 + item.span.right += 1 + break if item.dot == item.rhs.size + else + break + end + end +end + +def parse input, n, active_chart, passive_chart, grammar + rules_starting_with_t = grammar.rules.select { |r| r.arity > 0 && r.rhs.first.class==T } + rules_starting_with_nt = grammar.rules.select { |r| r.rhs.first.class == NT } + + # outer loop + 2.upto(n) { |span| + 0.upto(n-span) { |k| + + # try to apply rules starting with T + rules_starting_with_t.select { |r| r.rhs.first.word == input[k].word }.each { |r| + new_item = Item.new r + new_item.span.left = k + new_item.span.right = k + new_item.dot = 0 + scan new_item, passive_chart, input + active_chart.at(k, k+span) << new_item if new_item.span.right < k+span + } + + # seed active chart + rules_starting_with_nt.each { |r| + # no eps-productions! + next if r.rhs.size > span + #puts "#{r.rhs.size} #{span}" + new_item = Item.new r + new_item.span.left = k + new_item.span.right = k + new_item.dot = 0 + active_chart.at(k, k+span) << new_item + } + + puts "#{k} #{k+span}" + + # inner loop + puts "SIZE #{active_chart.at(k,k+span).size}" + while !active_chart.at(k,k+span).empty? + active_item = active_chart.at(k, k+span).pop + + 1.upto(span-1) { |span2| + k.upto((k+span)-span2) { |l| + + #puts "passive chart sz #{passive_chart.at(l, l+span2).size} #{l},#{l+span2}" + #passive_chart.at(l, l+span2).select { |item| item.lhs.symbol==active_item.rhs[active_item.dot].symbol }.each { |passive_item| + # if l==0&&span2==2 + # puts " #{passive_item}" + # end + if passive_chart.d["#{l},#{l+span2},#{active_item.rhs[active_item.dot].symbol}"] + if l == active_item.span.right + new_item = Item.new active_item + new_item.span.left = active_item.span.left + new_item.span.right = l+span2 + new_item.dot += 1 + scan new_item, passive_chart, input + if new_item.dot == new_item.rhs.size + if new_item.span.left==k&&new_item.span.right==(k+span) + #puts "NP@#{k},#{k+span} #{new_item}" + passive_chart.at(k,k+span) << new_item + end + else + if new_item.rhs[new_item.dot].class==NT && new_item.span.right < (k+span) + #puts "NA@#{k},#{k+span} #{new_item}" + active_chart.at(k,k+span) << new_item + end + end + end + #} + end + +# # try to advance items +# passive_chart.at(l, l+span2).each { |passive_item| +# active_chart.at(k, k+span).select { |active_item| active_item.rhs[active_item.dot].symbol == passive_item.lhs.symbol }.each { |active_item| +# if passive_item.span.left == active_item.span.right +# new_item = Item.new active_item +# new_item.span.left = active_item.span.left +# new_item.span.right = passive_item.span.right +# new_item.dot += 1 +# scan new_item, passive_chart, input +# if new_item.dot == new_item.rhs.size +# if new_item.span.left==k&&new_item.span.right==(k+span) +# puts "NP@#{k},#{k+span} #{new_item}" +# passive_chart.at(k,k+span) << new_item +# end +# else +# # item is dead unless it has an NT to fill +# if new_item.rhs[new_item.dot].class==NT && new_item.span.right < (k+span) +# puts "NA@#{k},#{k+span} #{new_item}" +# active_chart.at(k,k+span) << new_item +# end +# end +# end +# } +# } +# + + } + } + + end + + } + } +end + +def main + #input = preprocess 'ich sah ein kleines haus' + #input = preprocess 'lebensmittel schuld an europäischer inflation' + input = preprocess 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .' + n = input.size + g = Grammar.new 'example/grammar.3.gz' + #g.add_glue_rules + passive_chart = Chart.new n + active_chart = Chart.new n + init input, n, active_chart, passive_chart, g + parse input, n, active_chart, passive_chart, g + #puts "---\npassive chart" + #visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s if item.span.left==i&&item.span.right==j }; puts } + #puts "\nactive chart" + #visit(n, n, 0) { |i,j| puts "#{i},#{j}"; active_chart.at(i,j).each { |item| puts ' '+item.to_s }; puts } + +end + + +main + diff --git a/intersect2.rb b/intersect2.rb new file mode 100644 index 0000000..0d82c1d --- /dev/null +++ b/intersect2.rb @@ -0,0 +1,176 @@ +#!/usr/bin/env ruby + +require_relative './grammar.rb' +STDOUT.sync = true + + +class Chart + + def initialize n + @m = [] + (n+1).times { + _ = [] + (n+1).times { _ << [] } + @m << _ + } + @b = {} + end + + def at i, j + @m[i][j] + end + + def add i, j, item + at(i,j) << item + @b["#{i},#{j},#{item.lhs.symbol}"] = true + end + + def has symbol, i, j + return @b["#{i},#{j},#{symbol}"] + end +end + +class Span + attr_accessor :left, :right + + def initialize left=nil, right=nil + @left = left + @right = right + end +end + +class Item < Rule + attr_accessor :lhs, :rhs, :span, :dot + + def initialize rule_or_item, left, right, dot + @lhs = rule_or_item.lhs + @rhs = rule_or_item.rhs.dup + @span = Span.new left, right + @dot = dot + end + + def to_s + "#{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] [arity=#{arity}] (#{@span.left}, #{@span.right})" + end +end + +def init input, n, active_chart, passive_chart, grammar + grammar.flat.each { |r| + input.each_index { |i| + if input[i, r.rhs.size] == r.rhs.map { |x| x.word } + new_item = Item.new r, i, i+r.rhs.size, r.rhs.size + passive_chart.add i, i+r.rhs.size, new_item + end + } + } +end + +def scan item, input, limit, passive_chart + while item.rhs[item.dot].class == T && item.span.right < limit + if item.rhs[item.dot].word == input[item.span.right] + item.dot += 1 + item.span.right += 1 + else + break + end + end +end + +def parse input, n, active_chart, passive_chart, grammar + 2.upto(n) { |span| # outer loop + 0.upto(n-span) { |k| + + puts " span(#{k},#{k+span})" + + # try to apply rules starting with T + grammar.mixed.select { |r| r.rhs.first.word == input[k] }.each { |r| + new_item = Item.new r, k, k, 0 + scan new_item, input, k+span, passive_chart + active_chart.at(k,k+span) << new_item + } + + # seed active chart + grammar.rewrite.each { |r| + next if r.rhs.size > span + active_chart.at(k,k+span) << Item.new(r, k, k, 0) + } + + active_chart.at(k,k+span).each { |active_item| + next if active_item.rhs[active_item.dot].class==T + # inner loop + 1.upto(span-1) { |span2| + k.upto((k+span)-span2) { |l| + + if passive_chart.has active_item.rhs[active_item.dot].symbol, l, l+span2 + if l == active_item.span.right + new_item = Item.new active_item, active_item.span.left, l+span2, active_item.dot+1 + scan new_item, input, k+span, passive_chart + if new_item.dot == new_item.rhs.size # done with item + if new_item.span.left == k && new_item.span.right == k+span + passive_chart.add k, k+span, new_item + end + else + if new_item.rhs[new_item.dot].class == NT && new_item.span.right+(new_item.rhs.size-(new_item.dot)) <= k+span + active_chart.at(k,k+span) << new_item + end + end + end + end + } + } + } + + # 'self-filling' step + passive_chart.at(k,k+span).each { |passive_item| + active_chart.at(k,k+span).each { |active_item| + next if active_item.rhs[active_item.dot].class!=NT + if passive_item.lhs.symbol == active_item.rhs[active_item.dot].symbol + next if not active_item.span.right==passive_item.span.left + new_item = Item.new active_item, active_item.span.left, passive_item.span.right, active_item.dot+1 + scan new_item, input, k+span, passive_chart + if new_item.dot == new_item.rhs.size + if new_item.span.left == k && new_item.span.right == k+span + passive_chart.add k, k+span, new_item + else + puts "#{new_item}" # FIXME never happens + end + else + if new_item.rhs[new_item.dot].class == NT && new_item.span.right+(new_item.rhs.size-(new_item.dot)) <= k+span + puts "NA@#{k},#{k+span} #{new_item}" + active_chart.at(k,k+span) << new_item + end + end + end + } + } + } + } +end + +def visit n, depth, skip=0 # FIXME + (depth-skip).times { |i| + i += skip + 0.upto(n-(i+1)) { |j| + yield j, j+i+1 + } + } +end + +def main + #input = 'ich sah ein kleines haus'.split + input = 'lebensmittel schuld an europäischer inflation'.split + #input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split + n = input.size + grammar = Grammar.new 'example/grammar.x' + grammar.add_glue_rules + passive_chart = Chart.new n + active_chart = Chart.new n + init input, n, active_chart, passive_chart, grammar + parse input, n, active_chart, passive_chart, grammar + puts "---\npassive chart" + visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s }; puts } +end + + +main + -- cgit v1.2.3