diff options
-rw-r--r-- | example/3/cdec.ini | 2 | ||||
-rw-r--r-- | example/3/in.sgm | 1 | ||||
-rw-r--r-- | example/glue/in | 1 | ||||
-rw-r--r-- | example/glue/in.sgm | 1 | ||||
-rw-r--r-- | example/toy/in.sgm | 1 | ||||
-rwxr-xr-x | main.rb | 75 | ||||
-rw-r--r-- | parse.rb | 36 | ||||
-rwxr-xr-x | test/hg.rb | 29 | ||||
-rwxr-xr-x | test/parse.rb (renamed from test_parse.rb) | 22 | ||||
-rwxr-xr-x | test_hg.rb | 24 |
10 files changed, 156 insertions, 36 deletions
diff --git a/example/3/cdec.ini b/example/3/cdec.ini index ee65b4e..4491e78 100644 --- a/example/3/cdec.ini +++ b/example/3/cdec.ini @@ -1,5 +1,5 @@ formalism=scfg intersection_strategy=full -grammar=grammars/grammar.3.gz +grammar=grammar.3.gz #add_pass_through_rules=true diff --git a/example/3/in.sgm b/example/3/in.sgm new file mode 100644 index 0000000..f6fde68 --- /dev/null +++ b/example/3/in.sgm @@ -0,0 +1 @@ +<seg id='0' grammar='example/3/grammar.3.gz'>offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .</seg> diff --git a/example/glue/in b/example/glue/in new file mode 100644 index 0000000..2d49127 --- /dev/null +++ b/example/glue/in @@ -0,0 +1 @@ +lebensmittel schuld an europäischer inflation diff --git a/example/glue/in.sgm b/example/glue/in.sgm new file mode 100644 index 0000000..2f1a89b --- /dev/null +++ b/example/glue/in.sgm @@ -0,0 +1 @@ +<seg id='0' grammar='example/glue/grammar'>lebensmittel schuld an europäischer inflation</seg> diff --git a/example/toy/in.sgm b/example/toy/in.sgm new file mode 100644 index 0000000..561d346 --- /dev/null +++ b/example/toy/in.sgm @@ -0,0 +1 @@ +<seg id='0' grammar='example/toy/grammar'>ich sah ein kleines haus</seg> @@ -0,0 +1,75 @@ +#!/usr/bin/env ruby + +require 'trollop' +require 'xmlsimple' +require_relative 'parse' + + +def read_grammar fn, add_glue, add_pass_through + STDERR.write "> reading grammar '#{fn}'\n" + grammar = Grammar::Grammar.new fn + if add_glue + STDERR.write ">> adding glue grammar\n" + grammar.add_glue_rules + end + if add_pass_through + STDERR.write ">> adding pass-through grammar\n" + grammar.add_pass_through_rules input + end + return grammar +end + +def main + cfg = Trollop::options do + opt :input, "", :type => :string, :default => '-', :short => '-i' + opt :grammar, "", :type => :string, :default => nil, :short => '-g' + opt :weights, "", :type => :string, :default => nil, :short => '-w' + opt :add_glue, "", :type => :bool, :default => false, :short => '-h' + opt :add_pass_through, "", :type => :bool, :default => false, :short => '-p' + end + + grammar = nil + if cfg[:grammar] + grammar = read_grammar cfg[:grammar], cfg[:add_glue], cfg[:add_pass_through] + end + + STDERR.write "> reading input from '#{cfg[:input]}'\n" + ReadFile.readlines_strip(cfg[:input]).each { |input| + + x = XmlSimple.xml_in(input) + input = x['content'].split + n = input.size + + if x['grammar'] + grammar = read_grammar x['grammar'], cfg[:add_glue], cfg[:add_pass_through] + end + + STDERR.write "> initializing charts\n" + passive_chart = Parse::Chart.new n + active_chart = Parse::Chart.new n + Parse::init input, n, active_chart, passive_chart, grammar + + STDERR.write "> parsing\n" + Parse::parse input, n, active_chart, passive_chart, grammar + + weights = SparseVector.from_kv(ReadFile.read(cfg[:weights]), ' ', "\n") + if !weights + weights = SparseVector.new + end + + hypergraph, nodes_by_id = passive_chart.to_hg weights + + STDERR.write "> viterbi\n" + semiring = ViterbiSemiring.new + path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring + s = HG::derive path, path.last.head, [] + puts "#{s.map { |i| i.word }.join ' '}" + puts Math.log score + puts + + } +end + + +main + @@ -1,5 +1,6 @@ require 'zipf' require_relative 'grammar' +require_relative 'hg' module Parse @@ -82,6 +83,41 @@ class Chart return json_s end + + def to_hg weights + nodes = [] + edges = [] + nodes_by_id = {} + nodes << HG::Node.new(-1, "root", [-1,-1]) + nodes_by_id[-1] = nodes.last + id = 0 + seen = {} + Parse::visit(1, 0, @n) { |i,j| + self.at(i,j).each { |item| + _ = "#{item.lhs.symbol},#{i},#{j}" + if !seen[_] + nodes << HG::Node.new(id, item.lhs.symbol, [i,j]) + nodes_by_id[id] = nodes.last + seen[_] = id + id += 1 + end + } + } + + Parse::visit(1, 0, @n) { |i,j| + self.at(i,j).each { |item| + edges << HG::Hyperedge.new(nodes_by_id[seen[item.lhs.symbol+','+i.to_s+','+j.to_s]], \ + (item.tail_spans.empty? ? [nodes_by_id[-1]] : item.rhs.zip((0..item.rhs.size-1).map{|q| item.tail_spans[q] }).select{|x| x[0].class==Grammar::NT }.map{|x| nodes_by_id[seen["#{x[0].symbol},#{x[1].left},#{x[1].right}"]]}), \ + Math.exp(weights.dot(item.f)), + item.f, + Grammar::Rule.new(item.lhs, item.rhs, item.target, item.map, item.f), \ + ) + edges.last.head.incoming << edges.last + edges.last.tails.each { |n| n.outgoing << edges.last } + } + } + return HG::Hypergraph.new(nodes, edges), nodes_by_id + end end Span = Struct.new(:left, :right) diff --git a/test/hg.rb b/test/hg.rb new file mode 100755 index 0000000..16e6b6f --- /dev/null +++ b/test/hg.rb @@ -0,0 +1,29 @@ +#!/usr/bin/env ruby + +require_relative '../hg' + + +def main + # viterbi + semiring = ViterbiSemiring.new + hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/toy/toy.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/glue/glue.json', semiring, true) + #hypergraph, nodes_by_id = HG::read_hypergraph_from_json('../example/3/3.json', semiring, true) + path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring + s = HG::derive path, path.last.head, [] + puts "#{s.map { |i| i.word }.join ' '}" + puts Math.log score + puts + + # all paths + hypergraph.reset + paths = HG::all_paths hypergraph, nodes_by_id[-1] + paths.each_with_index { |p,i| + s = HG::derive p, p.last.head, [] + puts "#{i+1}. #{s.map { |x| x.word }.join ' '}" + } +end + + +main + diff --git a/test_parse.rb b/test/parse.rb index 835b08a..e139ea4 100755 --- a/test_parse.rb +++ b/test/parse.rb @@ -1,19 +1,19 @@ #!/usr/bin/env ruby -require_relative 'parse' +require_relative '../parse' def main STDERR.write "> reading input from TODO\n" - #input = 'ich sah ein kleines haus'.split + input = 'ich sah ein kleines haus'.split #input = 'lebensmittel schuld an europäischer inflation'.split - input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split + #input = 'offizielle prognosen sind von nur 3 prozent ausgegangen , meldete bloomberg .'.split n = input.size STDERR.write "> reading grammar\n" - #grammar = Grammar::Grammar.new 'example/toy/grammar' - #grammar = Grammar::Grammar.new 'example/glue/grammar' - grammar = Grammar::Grammar.new 'example/3/grammar.3.gz' + grammar = Grammar::Grammar.new '../example/toy/grammar' + #grammar = Grammar::Grammar.new '../example/glue/grammar' + #grammar = Grammar::Grammar.new '../example/3/grammar.3.gz' STDERR.write ">> adding glue grammar\n" #grammar.add_glue_rules @@ -29,12 +29,12 @@ def main STDERR.write "> parsing\n" Parse::parse input, n, active_chart, passive_chart, grammar - #puts "\n---\npassive chart" - #Parse::visit(1, 0, 5) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts " #{j} #{item.to_s}" }; puts } + puts "\n---\npassive chart" + Parse::visit(1, 0, 5) { |i,j| puts "#{i},#{j}"; passive_chart.at(i,j).each { |item| puts " #{j} #{item.to_s}" }; puts } - weights_file = 'example/toy/weights' - #weights_file = 'example/glue/weights' - #weights_file = 'example/3/weights.init' + weights_file = '../example/toy/weights' + #weights_file = '../example/glue/weights' + #weights_file = '../example/3/weights.init' weights = SparseVector.from_kv(ReadFile.read(weights_file), ' ', "\n") if !weights weights = SparseVector.new diff --git a/test_hg.rb b/test_hg.rb deleted file mode 100755 index 14fe011..0000000 --- a/test_hg.rb +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env ruby - -require_relative 'hg' - - -# viterbi -semiring = ViterbiSemiring.new -hypergraph, nodes_by_id = HG::read_hypergraph_from_json('example/toy/toy.json', semiring, true) -#hypergraph, nodes_by_id = HG::read_hypergraph_from_json('example/glue/glue.json', semiring, true) -#hypergraph, nodes_by_id = HG::read_hypergraph_from_json('example/3/3.json', semiring, true) -path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring -s = HG::derive path, path.last.head, [] -puts "#{s.map { |i| i.word }.join ' '}" -puts Math.log score -puts - -# all paths -hypergraph.reset -paths = HG::all_paths hypergraph, nodes_by_id[-1] -paths.each_with_index { |p,i| - s = HG::derive p, p.last.head, [] - puts "#{i+1}. #{s.map { |x| x.word }.join ' '}" -} - |