From 129a22cfcc7651daa4b11ed52e7870249f6373a5 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 16 Sep 2014 10:23:14 +0100 Subject: spring cleaning --- prototype/grammar.rb | 137 ++++++++++++++++++++++++++++++ prototype/hypergraph.rb | 219 ++++++++++++++++++++++++++++++++++++++++++++++++ prototype/parse.rb | 207 +++++++++++++++++++++++++++++++++++++++++++++ prototype/test_hg.rb | 32 +++++++ prototype/test_parse.rb | 49 +++++++++++ prototype/weaver.rb | 70 ++++++++++++++++ 6 files changed, 714 insertions(+) create mode 100644 prototype/grammar.rb create mode 100644 prototype/hypergraph.rb create mode 100644 prototype/parse.rb create mode 100755 prototype/test_hg.rb create mode 100755 prototype/test_parse.rb create mode 100755 prototype/weaver.rb (limited to 'prototype') diff --git a/prototype/grammar.rb b/prototype/grammar.rb new file mode 100644 index 0000000..42ffbc0 --- /dev/null +++ b/prototype/grammar.rb @@ -0,0 +1,137 @@ +module Grammar + +class T + attr_accessor :word + + def initialize word + @word = word + end + + def to_s + @word + end +end + +class NT + attr_accessor :symbol, :index + + def initialize symbol=nil, index=-1 + @symbol = symbol + @index = index + end + + #FIXME? symbol should not contain [, ] or ", + # else we're in trouble + def from_s s + @symbol, @index = s.delete('[]').split ',' + @symbol.strip! + @index = @index.to_i-1 + end + + def self.from_s s + new = NT.new + new.from_s s + return new + end + + #FIXME? indexed by 1 + def to_s + return "[#{@symbol},#{@index+1}]" if @index>=0 + return "[#{@symbol}]" + end +end + +class Rule + attr_accessor :lhs, :rhs, :target, :map, :f + + def initialize lhs=NT.new, rhs=[], target=[], map=[], f=SparseVector.new, arity=0 + @lhs = lhs + @rhs = rhs + @target = target + @map = map + @f = f + @arity = arity + end + + def read_rhs_ s, make_meta=false + a = [] + s.split.map { |x| + x.strip! + if x[0] == '[' && x[x.size-1] == ']' + a << NT.from_s(x) + if make_meta + @map << a.last.index + @arity += 1 + end + else + a << T.new(x) + end + } + return a + end + + def from_s s + lhs, rhs, target, f = splitpipe s, 3 + @lhs = NT.from_s lhs + @rhs = read_rhs_ rhs, true + @target = read_rhs_ target + @f = (f ? SparseVector.from_kv(f, '=', ' ') : SparseVector.new) + end + + def self.from_s s + r = Rule.new + r.from_s s + return r + end + + def to_s + "#{@lhs.to_s} ||| #{@rhs.map { |x| x.to_s }.join ' '} ||| #{@target.map { |x| x.to_s }.join ' '}" + end +end + +class Grammar + attr_accessor :rules, :start_nt, :start_t, :flat + + def initialize fn + @rules = []; @start_nt = []; @start_t = []; @flat = [] + n = 0 + ReadFile.readlines_strip(fn).each_with_index { |s,i| + STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%40==0 + n += 1 + @rules << Rule.from_s(s) + if @rules.last.rhs.first.class == NT + @start_nt << @rules.last + else + if rules.last.arity == 0 + @flat << @rules.last + else + @start_t << @rules.last + end + end + } + STDERR.write " #{n}\n" + end + + def to_s + @rules.map { |r| r.to_s }.join "\n" + end + + def add_glue + @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol| + @rules << Rule.new(NT.new('S'), [NT.new(symbol, 0)], [NT.new(symbol, 0)], [0]) + @start_nt << @rules.last + @rules << Rule.new(NT.new('S'), [NT.new('S', 0), NT.new('X', 1)], [NT.new('S', 0), NT.new('X', 1)], [0, 1]) + @start_nt << @rules.last + } + end + + def add_pass_through a + a.each { |word| + @rules << Rule.new(NT.new('X'), [T.new(word)], [T.new(word)]) + @flat << @rules.last + } + end +end + +end #module + diff --git a/prototype/hypergraph.rb b/prototype/hypergraph.rb new file mode 100644 index 0000000..d43d5ee --- /dev/null +++ b/prototype/hypergraph.rb @@ -0,0 +1,219 @@ +require 'zipf' +require_relative 'grammar' + +module HG + +class HG::Node + attr_accessor :id, :symbol, :left, :right, :outgoing, :incoming, :score + + def initialize id=nil, symbol='', span=[-1,-1], outgoing=[], incoming=[], score=nil + @id = id + @symbol = symbol + @left = span[0] + @right = span[1] + @outgoing = outgoing + @incoming = incoming + @score = score + end + + def to_s + "Node" + end +end + +class HG::Hypergraph + attr_accessor :nodes, :edges, :nodes_by_id + + def initialize nodes=[], edges=[], nodes_by_id={} + @nodes = nodes + @edges = edges + @nodes_by_id = nodes_by_id + @arity_ = nil + end + + def arity + @arity_ = @edges.map { |e| e.arity }.max if !@arity_ + return @arity_ + end + + def reset + @edges.each { |e| e.mark = 0 } + end + + def to_s + "Hypergraph" + end + + def to_json weights=nil + json_s = "{\n" + json_s += "\"weights\":#{(weights ? weights.to_json : '{}')},\n" + json_s += "\"nodes\":\n" + json_s += "[\n" + json_s += @nodes.map { |n| + "{ \"id\":#{n.id}, \"cat\":\"#{n.symbol.to_json.slice(1..-1).chomp('"')}\", \"span\":[#{n.left},#{n.right}] }" + }.join ",\n" + json_s += "\n],\n" + json_s += "\"edges\":\n" + json_s += "[\n" + json_s += @edges.map { |e| + "{ \"head\":#{e.head.id}, \"rule\":#{e.rule.to_json}, \"tails\":#{e.tails.map{ |n| n.id }}, \"f\":#{(e.f ? e.f.to_json : '{}')} }" + }.join ",\n" + json_s += "\n]\n" + json_s += "}\n" + + return json_s + + end +end + +class HG::Hyperedge + attr_accessor :head, :tails, :score, :f, :mark, :rule + + def initialize head=Node.new, tails=[], score=0.0, f=SparseVector.new, rule=nil + @head = head + @tails = tails + @score = score + @f = f + @mark = 0 + @rule = (rule.class==String ? Grammar::Rule.from_s(rule) : rule) + end + + def arity + return @tails.size + end + + def marked? + arity == @mark + end + + def to_s + "Hyperedge" + end +end + +def HG::topological_sort nodes + sorted = [] + s = nodes.reject { |n| !n.incoming.empty? } + while !s.empty? + sorted << s.shift + sorted.last.outgoing.each { |e| + next if e.marked? + e.mark += 1 + s << e.head if e.head.incoming.reject{ |f| f.marked? }.empty? + } + end + return sorted +end + +def HG::init nodes, semiring, root + nodes.each { |n| n.score=semiring.null } + root.score = semiring.one +end + +def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + init toposorted, semiring, root + toposorted.each { |n| + n.incoming.each { |e| + s = semiring.one + e.tails.each { |m| + s = semiring.multiply.call(s, m.score) + } + n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) + } + } +end + +def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new + toposorted = topological_sort hypergraph.nodes + init toposorted, semiring, root + best_path = [] + toposorted.each { |n| + best_edge = nil + n.incoming.each { |e| + s = semiring.one + e.tails.each { |m| + s = semiring.multiply.call(s, m.score) + } + if n.score < semiring.multiply.call(s, e.score) # ViterbiSemiring add + best_edge = e + end + n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) + } + best_path << best_edge if best_edge + } + return best_path, toposorted.last.score +end + +def HG::k_best hypergraph, root, semiring=nil + #TODO +end + +def HG::all_paths hypergraph, root + toposorted = topological_sort hypergraph.nodes + paths = [[]] + toposorted.each { |n| + next if n.incoming.empty? + new_paths = [] + while !paths.empty? + p = paths.pop + n.incoming.each { |e| + new_paths << p+[e] + } + end + paths = new_paths + } + return paths +end + +def HG::derive path, cur, carry + edge = path.select { |e| e.head.symbol==cur.symbol \ + && e.head.left==cur.left \ + && e.head.right==cur.right }.first + j = 0 + edge.rule.target.each { |i| + if i.class == Grammar::NT + derive path, edge.tails[edge.rule.map[j]], carry + j += 1 + else + carry << i + end + } + return carry +end + +def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false + nodes = [] + edges = [] + nodes_by_id = {} + h = JSON.parse File.new(fn).read + w = SparseVector.from_h h['weights'] + h['nodes'].each { |x| + n = Node.new x['id'], x['symbol'], x['span'] + nodes << n + nodes_by_id[n.id] = n + } + h['edges'].each { |x| + e = Hyperedge.new(nodes_by_id[x['head']], \ + x['tails'].map { |j| nodes_by_id[j] }.to_a, \ + (x['score'] ? semiring.convert.call(x['score'].to_f) : nil), \ + (x['f'] ? SparseVector.from_h(x['f']) : SparseVector.new), \ + x['rule']) + if x['f'] + if log_weights + e.score = Math.exp(w.dot(e.f)) + else + e.score = w.dot(e.f) + end + end + e.tails.each { |m| + m.outgoing << e + } + e.head.incoming << e + edges << e + } + return Hypergraph.new(nodes, edges), nodes_by_id +end + +end #module + diff --git a/prototype/parse.rb b/prototype/parse.rb new file mode 100644 index 0000000..adf2b91 --- /dev/null +++ b/prototype/parse.rb @@ -0,0 +1,207 @@ +require 'zipf' +require_relative 'grammar' +require_relative 'hypergraph' + +module Parse + +def Parse::visit i, l, r, x=0 + i.upto(r-x) { |span| + l.upto(r-span) { |k| + yield k, k+span + } + } +end + +class Chart + + def initialize n + @n = n + @m = [] + (n+1).times { + a = [] + (n+1).times { a << [] } + @m << a + } + @b = {} + end + + def at i, j + @m[i][j] + end + + def add item, i, j + at(i,j) << item + @b["#{i},#{j},#{item.lhs.symbol}"] = true + end + + def has symbol, i, j + return @b["#{i},#{j},#{symbol}"] + end + + def to_hg weights=SparseVector.new + nodes = [] + edges = [] + nodes_by_id = {} + nodes << HG::Node.new(-1, "root", [-1,-1]) + nodes_by_id[-1] = nodes.last + id = 0 + seen = {} + Parse::visit(1, 0, @n) { |i,j| + self.at(i,j).each { |item| + _ = "#{item.lhs.symbol},#{i},#{j}" + if !seen[_] + nodes << HG::Node.new(id, item.lhs.symbol, [i,j]) + nodes_by_id[id] = nodes.last + seen[_] = id + id += 1 + end + } + } + + Parse::visit(1, 0, @n) { |i,j| + self.at(i,j).each { |item| + edges << HG::Hyperedge.new(nodes_by_id[seen[item.lhs.symbol+','+i.to_s+','+j.to_s]], \ + (item.tail_spans.empty? ? [nodes_by_id[-1]] : item.rhs.zip((0..item.rhs.size-1).map{|q| item.tail_spans[q] }).select{|x| x[0].class==Grammar::NT }.map{|x| nodes_by_id[seen["#{x[0].symbol},#{x[1].left},#{x[1].right}"]]}), \ + Math.exp(weights.dot(item.f)), + item.f, + Grammar::Rule.new(item.lhs, item.rhs, item.target, item.map, item.f), \ + ) + edges.last.head.incoming << edges.last + edges.last.tails.each { |n| n.outgoing << edges.last } + } + } + return HG::Hypergraph.new(nodes, edges, nodes_by_id) + end +end + +Span = Struct.new(:left, :right) + +class Item < Grammar::Rule + attr_accessor :left, :right, :tail_spans, :dot, :f + + def initialize rule_or_item, left, right, dot + @lhs = Grammar::NT.new rule_or_item.lhs.symbol, rule_or_item.lhs.index + @left = left + @right = right + @rhs = [] + @tail_spans = {} # refers to source side, use @map + @f = rule_or_item.f + @map = (rule_or_item.map ? rule_or_item.map.dup : []) + rule_or_item.rhs.each_with_index { |x,i| # duplicate rhs partially + @rhs << x + if x.class == Grammar::NT + begin + if i >= dot + @tail_spans[i] = Span.new(-1, -1) + else + @tail_spans[i] = rule_or_item.tail_spans[i].dup + end + rescue + @tail_spans[i] = Span.new(-1, -1) + end + end + } + @dot = dot + @target = rule_or_item.target + end + + def to_s + "(#{@left}, #{@right}) [#{tail_spans.map{|k,v| k.to_s+'('+v.left.to_s+','+v.right.to_s+')'}.join ' '}] {#{@map.to_s.delete('[]')}} #{lhs} -> #{rhs.map{|i|i.to_s}.insert(@dot,'*').join ' '} [dot@#{@dot}] ||| #{@target.map{|x|x.to_s}.join ' '}" + end +end + +def Parse::init input, n, active_chart, passive_chart, grammar + grammar.flat.each { |r| + input.each_index { |i| + if input[i, r.rhs.size] == r.rhs.map { |x| x.word } + passive_chart.add Item.new(r, i, i+r.rhs.size, r.rhs.size), i, i+r.rhs.size + end + } + } +end + +def Parse::scan item, input, limit, passive_chart + while item.rhs[item.dot].class == Grammar::T + return false if item.right==limit + if item.rhs[item.dot].word == input[item.right] + item.dot += 1 + item.right += 1 + else + return false + end + end + return true +end + +def Parse::parse input, n, active_chart, passive_chart, grammar + visit(1, 0, n) { |i,j| + + STDERR.write " span(#{i},#{j})\n" + + # try to apply rules starting with T + grammar.start_t.select { |r| r.rhs.first.word == input[i] }.each { |r| + new_item = Item.new r, i, i, 0 + active_chart.at(i,j) << new_item if scan new_item, input, j, passive_chart + } + + # seed active chart + grammar.start_nt.each { |r| + next if r.rhs.size > j-i + active_chart.at(i,j) << Item.new(r, i, i, 0) + } + + # parse + new_symbols = [] + remaining_items = [] + while !active_chart.at(i,j).empty? + active_item = active_chart.at(i,j).pop + advanced = false + visit(1, i, j, 1) { |k,l| + if passive_chart.has active_item.rhs[active_item.dot].symbol, k, l + if k == active_item.right + new_item = Item.new active_item, active_item.left, l, active_item.dot+1 + new_item.tail_spans[new_item.dot-1] = Span.new(k,l) + if scan new_item, input, j, passive_chart + if new_item.dot == new_item.rhs.size + if new_item.left == i && new_item.right == j + new_symbols << new_item.lhs.symbol if !new_symbols.include? new_item.lhs.symbol + passive_chart.add new_item, i, j + advanced = true + end + else + if new_item.right+(new_item.rhs.size-(new_item.dot)) <= j + active_chart.at(i,j) << new_item + advanced = true + end + end + end + end + end + } + if !advanced + remaining_items << active_item + end + end + + + # 'self-filling' step + new_symbols.each { |s| + remaining_items.each { |item| + next if item.dot!=0 + next if item.rhs[item.dot].class!=Grammar::NT + if item.rhs[item.dot].symbol == s + new_item = Item.new item, i, j, item.dot+1 + new_item.tail_spans[new_item.dot-1] = Span.new(i,j) + if new_item.dot==new_item.rhs.size + new_symbols << new_item.lhs.symbol if !new_symbols.include? new_item.lhs.symbol + passive_chart.add new_item, i, j + end + end + } + } + + } +end + +end #module + diff --git a/prototype/test_hg.rb b/prototype/test_hg.rb new file mode 100755 index 0000000..4be72bd --- /dev/null +++ b/prototype/test_hg.rb @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby + +require_relative 'hypergraph' + + +def main + # viterbi + semiring = ViterbiSemiring.new + hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy-test.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/glue/glue.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/3/3.json', semiring, true) + path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring + s = HG::derive path, path.last.head, [] + path.each { |e| puts "#{e.rule}" } + puts "---" + puts "#{s.map { |i| i.word }.join ' '}" + puts Math.log score + puts + + # all paths + hypergraph.reset + paths = HG::all_paths hypergraph, nodes_by_id[-1] + paths.each_with_index { |p,i| + s = HG::derive p, p.last.head, [] + puts "#{i+1}. #{s.map { |x| x.word }.join ' '}" + } +end + + +main + diff --git a/prototype/test_parse.rb b/prototype/test_parse.rb new file mode 100755 index 0000000..918101b --- /dev/null +++ b/prototype/test_parse.rb @@ -0,0 +1,49 @@ +#!/usr/bin/env ruby + +require_relative 'parse' + + +def main + STDERR.write "> reading input from TODO\n" + input = 'ich sah ein kleines haus'.split + #input = 'lebensmittel schuld an europäischer inflation'.split + #input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split + n = input.size + + STDERR.write "> reading grammar\n" + grammar = Grammar::Grammar.new '../example/toy/grammar' + #grammar = Grammar::Grammar.new '../example/toy/grammar-test' + #grammar = Grammar::Grammar.new '../example/glue/grammar' + #grammar = Grammar::Grammar.new '../example/3/grammar.3.gz' + + STDERR.write ">> adding glue grammar\n" + #grammar.add_glue_rules + + STDERR.write ">> adding pass-through grammar\n" + #grammar.add_pass_through_rules input + + STDERR.write "> initializing charts\n" + passive_chart = Parse::Chart.new n + active_chart = Parse::Chart.new n + Parse::init input, n, active_chart, passive_chart, grammar + + STDERR.write "> parsing\n" + Parse::parse input, n, active_chart, passive_chart, grammar + + puts "\n---\npassive chart" + Parse::visit(1, 0, 5) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts " #{j} #{item.to_s}" }; puts } + + weights_file = '../example/toy/weights' + #weights_file = '../example/glue/weights' + #weights_file = '../example/3/weights.init' + weights = SparseVector.from_kv(ReadFile.read(weights_file), ' ', "\n") + if !weights + weights = SparseVector.new + end + + puts passive_chart.to_hg.to_json weights +end + + +main + diff --git a/prototype/weaver.rb b/prototype/weaver.rb new file mode 100755 index 0000000..966d4c8 --- /dev/null +++ b/prototype/weaver.rb @@ -0,0 +1,70 @@ +#!/usr/bin/env ruby + +require 'trollop' +require 'xmlsimple' +require_relative 'parse' + +def read_grammar fn, add_glue, add_pass_through, input=nil + STDERR.write "> reading grammar '#{fn}'\n" + grammar = Grammar::Grammar.new fn + if add_glue + STDERR.write ">> adding glue rules\n" + grammar.add_glue + end + if add_pass_through + STDERR.write ">> adding pass-through rules\n" + grammar.add_pass_through input + end + return grammar +end + +def main + cfg = Trollop::options do + opt :input, "", :type => :string, :default => '-', :short => '-i' + opt :grammar, "", :type => :string, :default => nil, :short => '-g' + opt :weights, "", :type => :string, :default => nil, :short => '-w' + opt :add_glue, "", :type => :bool, :default => false, :short => '-l' + opt :add_pass_through, "", :type => :bool, :default => false, :short => '-p' + end + + grammar = nil + if cfg[:grammar] + grammar = read_grammar cfg[:grammar], cfg[:add_glue], cfg[:add_pass_through] + end + + STDERR.write "> reading input from '#{cfg[:input]}'\n" + ReadFile.readlines_strip(cfg[:input]).each { |input| + + x = XmlSimple.xml_in(input) + input = x['content'].split + n = input.size + + if x['grammar'] + grammar = read_grammar x['grammar'], cfg[:add_glue], cfg[:add_pass_through], input + end + + STDERR.write "> initializing charts\n" + passive_chart = Parse::Chart.new n + active_chart = Parse::Chart.new n + Parse::init input, n, active_chart, passive_chart, grammar + + STDERR.write "> parsing\n" + Parse::parse input, n, active_chart, passive_chart, grammar + + weights = SparseVector.from_kv(ReadFile.read(cfg[:weights]), ' ', "\n") + if !weights + weights = SparseVector.new + end + + hypergraph = passive_chart.to_hg weights + + STDERR.write "> viterbi\n" + semiring = ViterbiSemiring.new + path, score = HG::viterbi_path hypergraph, hypergraph.nodes_by_id[-1], semiring + s = HG::derive path, path.last.head, [] + STDERR.write " #{s.map { |i| i.word }.join ' '} ||| #{Math.log score}\n" + } +end + +main + -- cgit v1.2.3