summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-05-20 14:40:55 +0200
committerPatrick Simianer <p@simianer.de>2014-05-20 14:40:55 +0200
commitb385228969382185afee33d0661790b2831ac6c1 (patch)
tree61f49eed0a21fd146b059605d5e85a28f40f2053
parent25d3f57028fecfcc6ee344a89577542368e92bf3 (diff)
intersect2 correct & fast (ok, at least not impossibly slow)
-rw-r--r--README.md1
-rw-r--r--example/grammar5
-rw-r--r--grammar.rb24
-rw-r--r--intersect.rb47
-rw-r--r--intersect1.rb221
-rw-r--r--intersect2.rb176
6 files changed, 443 insertions, 31 deletions
diff --git a/README.md b/README.md
index 5559e0d..30acc8b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,3 @@
nothing to see here
+https://github.com/jweese/thrax/wiki/Glue-grammar
diff --git a/example/grammar b/example/grammar
index 1d72ce5..4a3c93c 100644
--- a/example/grammar
+++ b/example/grammar
@@ -1,8 +1,5 @@
[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0
-[S] ||| ich sah ein kleines haus |||
-[S] ||| ich [VP,2] ||| i [1] [2] ||| logp=0
-[S] ||| ich sah ein [NN,1] haus ||| i saw a [NN,1] house ||| logp=0
-[S] ||| ich [V,1] ein [NN,1] haus ||| i [1] a [2] house ||| logp=0
+[S] ||| ich sah ein [NN,1] |||
[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0
[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 use_a=1.0
[NN] ||| kleines haus ||| small house ||| logp=0 use_house=1
diff --git a/grammar.rb b/grammar.rb
index 8a08cc1..4183a8f 100644
--- a/grammar.rb
+++ b/grammar.rb
@@ -65,15 +65,23 @@ class Rule
end
class Grammar
- attr_accessor :rules
+ attr_accessor :rules, :rewrite, :flat, :mixed
def initialize fn
- @rules = []
- @glue_rules = []
- ReadFile.readlines_strip(fn).each_with_index { |s,j|
+ @rules = []; @rewrite = []; @flat = []; @mixed = []
+ ReadFile.readlines_strip(fn).each_with_index { |s,i|
STDERR.write '.'
- STDERR.write "\n" if (j+1)%80==0
+ STDERR.write "\n" if (i+1)%80==0
@rules << Rule.from_s(s)
+ if @rules.last.rhs.first.class == NT
+ @rewrite << @rules.last
+ else
+ if rules.last.arity == 0
+ @flat << @rules.last
+ else
+ @mixed << @rules.last
+ end
+ end
}
STDERR.write "\n"
end
@@ -85,18 +93,18 @@ class Grammar
end
def add_glue_rules
- # see https://github.com/jweese/thrax/wiki/Glue-grammar
@rules.map { |r| r.lhs.symbol }.reject { |s| s=='S' }.uniq.each { |s|
@rules << Rule.new(NT.new('S'), [NT.new(s)])
- @glue_rules << @rules.last
+ @rewrite << @rules.last
@rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
- @glue_rules << @rules.last
+ @rewrite << @rules.last
}
end
def add_pass_through_rules input
input.each { |terminal|
@rules << Rule.new(NT.new('X'), [T.new(terminal.word)])
+ @flat << @rules.last
}
end
end
diff --git a/intersect.rb b/intersect.rb
index fa4a3bd..64b9696 100644
--- a/intersect.rb
+++ b/intersect.rb
@@ -68,7 +68,8 @@ end
def init active_chart, passive_chart, grammar, input, n
# pre-fill passive chart w/ 0-arity rules
- grammar.rules.select { |r| r.rhs.first.class==T }.each { |r|
+ #grammar.rules.select { |r| r.rhs.first.class==T }.each { |r|
+ grammar.rules.select { |r| r.arity==0 }.each { |r|
input.each_index.select { |i| input[i].word==r.rhs.first.word }.each { |j|
k = 1
if r.rhs.size > 1
@@ -86,16 +87,16 @@ def init active_chart, passive_chart, grammar, input, n
end
if k == r.rhs.size
passive_chart.add(r, j, j+k, j+k, k)
- else
- (j+k).upto(n) { |l| active_chart.add r, j, l, j+k, k }
+ #else
+ # (j+k).upto(n) { |l| active_chart.add r, j, l, j+k, k }
end
}
}
# seed active chart
- s = grammar.rules.reject { |r| r.rhs.first.class!=NT }
- visit(n, n, 1) { |i,j|
- s.each { |r| active_chart.add(r, i, j, i) }
- }
+ #s = grammar.rules.reject { |r| r.rhs.first.class!=NT }
+ #visit(n, n, 1) { |i,j|
+ # s.each { |r| active_chart.add(r, i, j, i) }
+ #}
end
def scan item, passive_chart, input
@@ -111,11 +112,19 @@ def scan item, passive_chart, input
end
end
-def parse i, j, n, active_chart, passive_chart, input
- active_chart.at(i,j).each { |active_item|
+def parse i, j, n, active_chart, passive_chart, input, grammar
+ grammar.rules.each { |r|
+ if r.rhs.first.class==T&&r.rhs.first.word==input[i].word
+ new_item = Item.new r
+ scan new_item, passive_chart, input
+ end
+ }
+ #active_chart.at(i,j).each { |active_item|
1.upto(n) { |span|
break if span==(j-i)
i.upto(j-span) { |k|
+ STDERR.write " #{k},#{k+span}\n"
+ <<-DOC
passive_chart.at(k, k+span).each { |passive_item|
if active_item.rhs[active_item.dot].class==NT && passive_item.lhs.symbol == active_item.rhs[active_item.dot].symbol
next if not active_item.span.right==passive_item.span.left
@@ -131,10 +140,10 @@ def parse i, j, n, active_chart, passive_chart, input
end
end
}
+ DOC
}
}
- true
- }
+ #}
# self-filling
to_add_active = []
to_add_passive = []
@@ -164,14 +173,14 @@ def preprocess s
end
def main
- #input = preprocess 'ich sah ein kleines haus'
- input = preprocess 'lebensmittel schuld an europäischer inflation'
+ input = preprocess 'ich sah ein kleines haus'
+ #input = preprocess 'lebensmittel schuld an europäischer inflation'
#input = preprocess 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'
n = input.size
puts 'reading grammar'
- #g = Grammar.new 'example/grammar'
- g = Grammar.new 'example/grammar.x'
+ g = Grammar.new 'example/grammar'
+ #g = Grammar.new 'example/grammar.x'
#g = Grammar.new 'example/grammar.3.gz' # 4th segment of newstest2008
puts 'adding glue rules'
@@ -187,12 +196,12 @@ def main
puts 'parsing'
visit(n, n, 1) { |i,j|
- STDERR.write " span (#{i}, #{j})\n"
- parse i, j, n, active_chart, passive_chart, input
+ STDERR.write "span (#{i}, #{j})\n"
+ parse i, j, n, active_chart, passive_chart, input, g
}
- puts "---\npassive chart"
- visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s if item.span.left==i&&item.span.right==j }; puts }
+ #puts "---\npassive chart"
+ #visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s if item.span.left==i&&item.span.right==j }; puts }
end
diff --git a/intersect1.rb b/intersect1.rb
new file mode 100644
index 0000000..fd465b0
--- /dev/null
+++ b/intersect1.rb
@@ -0,0 +1,221 @@
+#!/usr/bin/env ruby
+
+require_relative './grammar.rb'
+STDOUT.sync = true
+
+
+class Chart
+ attr_accessor :has
+
+ def initialize n
+ @m = []
+ (n+1).times {
+ _ = []
+ (n+1).times { _ << [] }
+ @m << _
+ }
+ @has = {}
+ end
+
+ def at i, j
+ @m[i][j]
+ end
+
+ def add rule_or_item, i, j, right, dot=0
+ item = Item.new(rule_or_item)
+ item.span.left = i
+ item.span.right = right
+ item.dot = dot
+ at(i, j) << item
+ end
+end
+
+class Span
+ attr_accessor :left, :right
+
+ def initialize left=nil, right=nil
+ @left = left
+ @right = right
+ end
+end
+
+class Item < Rule
+ attr_accessor :lhs, :rhs, :span, :dot
+
+ def initialize rule_or_item
+ @lhs = rule_or_item.lhs.dup
+ @rhs = rule_or_item.rhs.dup
+ if rule_or_item.class == Item
+ @span = Span.new rule_or_item.span.left, rule_or_item.span.right
+ @dot = rule_or_item.dot
+ else
+ @span = Span.new
+ @dot = 0
+ end
+ end
+
+ def to_s
+ "#{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] [arity=#{arity}] (#{@span.left}, #{@span.right})"
+ end
+end
+
+def visit n, depth, skip=0
+ (depth-skip).times { |i|
+ i += skip
+ 0.upto(n-(i+1)) { |j|
+ yield j, j+i+1
+ }
+ }
+end
+
+def preprocess s
+ s.split.map { |i| T.new i }
+end
+
+def init input, n, active_chart, passive_chart, grammar
+ grammar.rules.select { |r| r.arity==0 }.each { |r|
+ input.each_index { |i|
+ if input[i, r.rhs.size].map{|x|x.word}==r.rhs.map{|x|x.word}
+ passive_chart.add Item.new(r), i, i+r.rhs.size, i+r.rhs.size, r.rhs.size
+ passive_chart.d["#{i},#{i+r.rhs.size},#{r.lhs.symbol}"] = true
+ end
+ }
+ }
+end
+
+def scan item, passive_chart, input
+ while item.rhs[item.dot].class == T
+ break if item.span.right > input.size-1
+ if item.rhs[item.dot].word == input[item.span.right].word
+ item.dot += 1
+ item.span.right += 1
+ break if item.dot == item.rhs.size
+ else
+ break
+ end
+ end
+end
+
+def parse input, n, active_chart, passive_chart, grammar
+ rules_starting_with_t = grammar.rules.select { |r| r.arity > 0 && r.rhs.first.class==T }
+ rules_starting_with_nt = grammar.rules.select { |r| r.rhs.first.class == NT }
+
+ # outer loop
+ 2.upto(n) { |span|
+ 0.upto(n-span) { |k|
+
+ # try to apply rules starting with T
+ rules_starting_with_t.select { |r| r.rhs.first.word == input[k].word }.each { |r|
+ new_item = Item.new r
+ new_item.span.left = k
+ new_item.span.right = k
+ new_item.dot = 0
+ scan new_item, passive_chart, input
+ active_chart.at(k, k+span) << new_item if new_item.span.right < k+span
+ }
+
+ # seed active chart
+ rules_starting_with_nt.each { |r|
+ # no eps-productions!
+ next if r.rhs.size > span
+ #puts "#{r.rhs.size} #{span}"
+ new_item = Item.new r
+ new_item.span.left = k
+ new_item.span.right = k
+ new_item.dot = 0
+ active_chart.at(k, k+span) << new_item
+ }
+
+ puts "#{k} #{k+span}"
+
+ # inner loop
+ puts "SIZE #{active_chart.at(k,k+span).size}"
+ while !active_chart.at(k,k+span).empty?
+ active_item = active_chart.at(k, k+span).pop
+
+ 1.upto(span-1) { |span2|
+ k.upto((k+span)-span2) { |l|
+
+ #puts "passive chart sz #{passive_chart.at(l, l+span2).size} #{l},#{l+span2}"
+ #passive_chart.at(l, l+span2).select { |item| item.lhs.symbol==active_item.rhs[active_item.dot].symbol }.each { |passive_item|
+ # if l==0&&span2==2
+ # puts " #{passive_item}"
+ # end
+ if passive_chart.d["#{l},#{l+span2},#{active_item.rhs[active_item.dot].symbol}"]
+ if l == active_item.span.right
+ new_item = Item.new active_item
+ new_item.span.left = active_item.span.left
+ new_item.span.right = l+span2
+ new_item.dot += 1
+ scan new_item, passive_chart, input
+ if new_item.dot == new_item.rhs.size
+ if new_item.span.left==k&&new_item.span.right==(k+span)
+ #puts "NP@#{k},#{k+span} #{new_item}"
+ passive_chart.at(k,k+span) << new_item
+ end
+ else
+ if new_item.rhs[new_item.dot].class==NT && new_item.span.right < (k+span)
+ #puts "NA@#{k},#{k+span} #{new_item}"
+ active_chart.at(k,k+span) << new_item
+ end
+ end
+ end
+ #}
+ end
+
+# # try to advance items
+# passive_chart.at(l, l+span2).each { |passive_item|
+# active_chart.at(k, k+span).select { |active_item| active_item.rhs[active_item.dot].symbol == passive_item.lhs.symbol }.each { |active_item|
+# if passive_item.span.left == active_item.span.right
+# new_item = Item.new active_item
+# new_item.span.left = active_item.span.left
+# new_item.span.right = passive_item.span.right
+# new_item.dot += 1
+# scan new_item, passive_chart, input
+# if new_item.dot == new_item.rhs.size
+# if new_item.span.left==k&&new_item.span.right==(k+span)
+# puts "NP@#{k},#{k+span} #{new_item}"
+# passive_chart.at(k,k+span) << new_item
+# end
+# else
+# # item is dead unless it has an NT to fill
+# if new_item.rhs[new_item.dot].class==NT && new_item.span.right < (k+span)
+# puts "NA@#{k},#{k+span} #{new_item}"
+# active_chart.at(k,k+span) << new_item
+# end
+# end
+# end
+# }
+# }
+#
+
+ }
+ }
+
+ end
+
+ }
+ }
+end
+
+def main
+ #input = preprocess 'ich sah ein kleines haus'
+ #input = preprocess 'lebensmittel schuld an europäischer inflation'
+ input = preprocess 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'
+ n = input.size
+ g = Grammar.new 'example/grammar.3.gz'
+ #g.add_glue_rules
+ passive_chart = Chart.new n
+ active_chart = Chart.new n
+ init input, n, active_chart, passive_chart, g
+ parse input, n, active_chart, passive_chart, g
+ #puts "---\npassive chart"
+ #visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s if item.span.left==i&&item.span.right==j }; puts }
+ #puts "\nactive chart"
+ #visit(n, n, 0) { |i,j| puts "#{i},#{j}"; active_chart.at(i,j).each { |item| puts ' '+item.to_s }; puts }
+
+end
+
+
+main
+
diff --git a/intersect2.rb b/intersect2.rb
new file mode 100644
index 0000000..0d82c1d
--- /dev/null
+++ b/intersect2.rb
@@ -0,0 +1,176 @@
+#!/usr/bin/env ruby
+
+require_relative './grammar.rb'
+STDOUT.sync = true
+
+
+class Chart
+
+ def initialize n
+ @m = []
+ (n+1).times {
+ _ = []
+ (n+1).times { _ << [] }
+ @m << _
+ }
+ @b = {}
+ end
+
+ def at i, j
+ @m[i][j]
+ end
+
+ def add i, j, item
+ at(i,j) << item
+ @b["#{i},#{j},#{item.lhs.symbol}"] = true
+ end
+
+ def has symbol, i, j
+ return @b["#{i},#{j},#{symbol}"]
+ end
+end
+
+class Span
+ attr_accessor :left, :right
+
+ def initialize left=nil, right=nil
+ @left = left
+ @right = right
+ end
+end
+
+class Item < Rule
+ attr_accessor :lhs, :rhs, :span, :dot
+
+ def initialize rule_or_item, left, right, dot
+ @lhs = rule_or_item.lhs
+ @rhs = rule_or_item.rhs.dup
+ @span = Span.new left, right
+ @dot = dot
+ end
+
+ def to_s
+ "#{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] [arity=#{arity}] (#{@span.left}, #{@span.right})"
+ end
+end
+
+def init input, n, active_chart, passive_chart, grammar
+ grammar.flat.each { |r|
+ input.each_index { |i|
+ if input[i, r.rhs.size] == r.rhs.map { |x| x.word }
+ new_item = Item.new r, i, i+r.rhs.size, r.rhs.size
+ passive_chart.add i, i+r.rhs.size, new_item
+ end
+ }
+ }
+end
+
+def scan item, input, limit, passive_chart
+ while item.rhs[item.dot].class == T && item.span.right < limit
+ if item.rhs[item.dot].word == input[item.span.right]
+ item.dot += 1
+ item.span.right += 1
+ else
+ break
+ end
+ end
+end
+
+def parse input, n, active_chart, passive_chart, grammar
+ 2.upto(n) { |span| # outer loop
+ 0.upto(n-span) { |k|
+
+ puts " span(#{k},#{k+span})"
+
+ # try to apply rules starting with T
+ grammar.mixed.select { |r| r.rhs.first.word == input[k] }.each { |r|
+ new_item = Item.new r, k, k, 0
+ scan new_item, input, k+span, passive_chart
+ active_chart.at(k,k+span) << new_item
+ }
+
+ # seed active chart
+ grammar.rewrite.each { |r|
+ next if r.rhs.size > span
+ active_chart.at(k,k+span) << Item.new(r, k, k, 0)
+ }
+
+ active_chart.at(k,k+span).each { |active_item|
+ next if active_item.rhs[active_item.dot].class==T
+ # inner loop
+ 1.upto(span-1) { |span2|
+ k.upto((k+span)-span2) { |l|
+
+ if passive_chart.has active_item.rhs[active_item.dot].symbol, l, l+span2
+ if l == active_item.span.right
+ new_item = Item.new active_item, active_item.span.left, l+span2, active_item.dot+1
+ scan new_item, input, k+span, passive_chart
+ if new_item.dot == new_item.rhs.size # done with item
+ if new_item.span.left == k && new_item.span.right == k+span
+ passive_chart.add k, k+span, new_item
+ end
+ else
+ if new_item.rhs[new_item.dot].class == NT && new_item.span.right+(new_item.rhs.size-(new_item.dot)) <= k+span
+ active_chart.at(k,k+span) << new_item
+ end
+ end
+ end
+ end
+ }
+ }
+ }
+
+ # 'self-filling' step
+ passive_chart.at(k,k+span).each { |passive_item|
+ active_chart.at(k,k+span).each { |active_item|
+ next if active_item.rhs[active_item.dot].class!=NT
+ if passive_item.lhs.symbol == active_item.rhs[active_item.dot].symbol
+ next if not active_item.span.right==passive_item.span.left
+ new_item = Item.new active_item, active_item.span.left, passive_item.span.right, active_item.dot+1
+ scan new_item, input, k+span, passive_chart
+ if new_item.dot == new_item.rhs.size
+ if new_item.span.left == k && new_item.span.right == k+span
+ passive_chart.add k, k+span, new_item
+ else
+ puts "#{new_item}" # FIXME never happens
+ end
+ else
+ if new_item.rhs[new_item.dot].class == NT && new_item.span.right+(new_item.rhs.size-(new_item.dot)) <= k+span
+ puts "NA@#{k},#{k+span} #{new_item}"
+ active_chart.at(k,k+span) << new_item
+ end
+ end
+ end
+ }
+ }
+ }
+ }
+end
+
+def visit n, depth, skip=0 # FIXME
+ (depth-skip).times { |i|
+ i += skip
+ 0.upto(n-(i+1)) { |j|
+ yield j, j+i+1
+ }
+ }
+end
+
+def main
+ #input = 'ich sah ein kleines haus'.split
+ input = 'lebensmittel schuld an europäischer inflation'.split
+ #input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split
+ n = input.size
+ grammar = Grammar.new 'example/grammar.x'
+ grammar.add_glue_rules
+ passive_chart = Chart.new n
+ active_chart = Chart.new n
+ init input, n, active_chart, passive_chart, grammar
+ parse input, n, active_chart, passive_chart, grammar
+ puts "---\npassive chart"
+ visit(n, n, 0) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts ' '+item.to_s }; puts }
+end
+
+
+main
+