From 24e296e97c32fdf6c3b7fd5ecb5596165d4dad14 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sat, 7 Jun 2014 11:44:02 +0200 Subject: better json format, class hierarchy untangled --- example/json/test.json | 44 ++++++++++++++-------------- grammar.rb | 30 +++++++++---------- hg.rb | 78 ++++++++++++++++++++----------------------------- test.rb | 9 +++--- util/cdec_hg_to_json.py | 30 +++++++------------ 5 files changed, 81 insertions(+), 110 deletions(-) diff --git a/example/json/test.json b/example/json/test.json index aa0b45e..d865fa0 100644 --- a/example/json/test.json +++ b/example/json/test.json @@ -1,33 +1,33 @@ { "weights":{ -"PhraseModel_0":0.0, "PhraseModel_1":0.0, "PhraseModel_2":0.0, "PhraseModel_3":0.0, "PhraseModel_4":0.0, "PhraseModel_5":0.0, "PhraseModel_6":0.0, "PhraseModel_7":0.0, "PhraseModel_8":0.0, "PhraseModel_9":0.0, "PhraseModel_10":0.0, "PhraseModel_11":0.0, "PhraseModel_12":0.0, "PhraseModel_13":0.0, "PhraseModel_14":0.0, "PhraseModel_15":0.0, "PhraseModel_16":0.0, "PhraseModel_17":0.0, "PhraseModel_18":0.0, "PhraseModel_19":0.0, "PhraseModel_20":0.0, "PhraseModel_21":0.0, "PhraseModel_22":0.0, "PhraseModel_23":0.0, "PhraseModel_24":0.0, "PhraseModel_25":0.0, "PhraseModel_26":0.0, "PhraseModel_27":0.0, "PhraseModel_28":0.0, "PhraseModel_29":0.0, "PhraseModel_30":0.0, "PhraseModel_31":0.0, "PhraseModel_32":0.0, "PhraseModel_33":0.0, "PhraseModel_34":0.0, "PhraseModel_35":0.0, "PhraseModel_36":0.0, "PhraseModel_37":0.0, "PhraseModel_38":0.0, "PhraseModel_39":0.0, "PhraseModel_40":0.0, "PhraseModel_41":0.0, "PhraseModel_42":0.0, "PhraseModel_43":0.0, "PhraseModel_44":0.0, "PhraseModel_45":0.0, "PhraseModel_46":0.0, "PhraseModel_47":0.0, "PhraseModel_48":0.0, "PhraseModel_49":0.0, "PhraseModel_50":0.0, "PhraseModel_51":0.0, "PhraseModel_52":0.0, "PhraseModel_53":0.0, "PhraseModel_54":0.0, "PhraseModel_55":0.0, "PhraseModel_56":0.0, "PhraseModel_57":0.0, "PhraseModel_58":0.0, "PhraseModel_59":0.0, "PhraseModel_60":0.0, "PhraseModel_61":0.0, "PhraseModel_62":0.0, "PhraseModel_63":0.0, "PhraseModel_64":0.0, "PhraseModel_65":0.0, "PhraseModel_66":0.0, "PhraseModel_67":0.0, "PhraseModel_68":0.0, "PhraseModel_69":0.0, "PhraseModel_70":0.0, "PhraseModel_71":0.0, "PhraseModel_72":0.0, "PhraseModel_73":0.0, "PhraseModel_74":0.0, "PhraseModel_75":0.0, "PhraseModel_76":0.0, "PhraseModel_77":0.0, "PhraseModel_78":0.0, "PhraseModel_79":0.0, "PhraseModel_80":0.0, "PhraseModel_81":0.0, "PhraseModel_82":0.0, "PhraseModel_83":0.0, "PhraseModel_84":0.0, "PhraseModel_85":0.0, "PhraseModel_86":0.0, "PhraseModel_87":0.0, "PhraseModel_88":0.0, "PhraseModel_89":0.0, "PhraseModel_90":0.0, "PhraseModel_91":0.0, "PhraseModel_92":0.0, "PhraseModel_93":0.0, "PhraseModel_94":0.0, "PhraseModel_95":0.0, "PhraseModel_96":0.0, "PhraseModel_97":0.0, "PhraseModel_98":0.0, "PhraseModel_99":0.0, "logp":2.0, "use_i":0.0, "use_a":0.0, "use_house":15.0, "use_shell":1.0 +"logp":2.0, "use_shell":1.0 }, "nodes": [ -{ "id":-1 }, -{ "id":0 }, -{ "id":1 }, -{ "id":2 }, -{ "id":3 }, -{ "id":4 }, -{ "id":5 }, -{ "id":6 }, -{ "id":7 } +{ "id":-1, "cat":"root", "span":[-1,-1] }, +{ "id":0, "cat":"NP", "span":[0,1] }, +{ "id":1, "cat":"V", "span":[1,2] }, +{ "id":2, "cat":"JJ", "span":[3,4] }, +{ "id":3, "cat":"NN", "span":[3,5] }, +{ "id":4, "cat":"NP", "span":[2,5] }, +{ "id":5, "cat":"VP", "span":[1,5] }, +{ "id":6, "cat":"S", "span":[0,5] }, +{ "id":7, "cat":"Goal", "span":[0,5] } ], "edges": [ -{"head":0, "rule":"[NP@0:1] ||| ich ||| i ||| logp=-0.5 use_i=1.0", "tails":[ -1 ], "f":{"logp":-0.5, "use_i":1.0} }, -{"head":1, "rule":"[V@1:2] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0", "tails":[ -1 ], "f":{"logp":-0.25, "use_saw":1.0} }, -{"head":2, "rule":"[JJ@3:4] ||| kleines ||| small ||| logp=0.0 use_small=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_small":1.0} }, -{"head":2, "rule":"[JJ@3:4] ||| kleines ||| little ||| logp=0.0 use_little=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_little":1.0} }, -{"head":3, "rule":"[NN@3:5] ||| kleines haus ||| small house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0} }, -{"head":3, "rule":"[NN@3:5] ||| kleines haus ||| little house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0} }, -{"head":3, "rule":"[NN@3:5] ||| [JJ@3:4,1] haus ||| [JJ@3:4,1] house ||| logp=0.0 use_house=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_house":1.0} }, -{"head":3, "rule":"[NN@3:5] ||| [JJ@3:4,1] haus ||| [JJ@3:4,1] shell ||| logp=0.0 use_shell=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_shell":1.0} }, -{"head":4, "rule":"[NP@2:5] ||| ein [NN@3:5,1] ||| a [NN@3:5,1] ||| logp=0.0 use_a=1.0", "tails":[ 3 ], "f":{"logp":0.0, "use_a":1.0} }, -{"head":5, "rule":"[VP@1:5] ||| [V@1:2,1] [NP@2:5,2] ||| [V@1:2,1] [NP@2:5,2] ||| logp=0.0", "tails":[ 1,4 ], "f":{"logp":0.0} }, -{"head":6, "rule":"[S@0:5] ||| [NP@0:1,1] [VP@1:5,2] ||| [NP@0:1,1] [VP@1:5,2] ||| logp=0.0", "tails":[ 0,5 ], "f":{"logp":0.0} }, -{"head":7, "rule":"[Goal@0:5] ||| [S@0:5,1] ||| [S@0:5,1] ||| ", "tails":[ 6 ], "f":{} } +{"head":0, "rule":"[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0", "tails":[ -1 ], "f":{"logp":-0.5, "use_i":1.0}, "weight":0.367879441171 }, +{"head":1, "rule":"[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0", "tails":[ -1 ], "f":{"logp":-0.25, "use_saw":1.0}, "weight":0.606530659713 }, +{"head":2, "rule":"[JJ] ||| kleines ||| small ||| logp=0.0 use_small=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_small":1.0}, "weight":1.0 }, +{"head":2, "rule":"[JJ] ||| kleines ||| little ||| logp=0.0 use_little=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_little":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| kleines haus ||| small house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| kleines haus ||| little house ||| logp=0.0 use_house=1.0", "tails":[ -1 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0.0 use_house=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":3, "rule":"[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0.0 use_shell=1.0", "tails":[ 2 ], "f":{"logp":0.0, "use_shell":1.0}, "weight":2.71828182846 }, +{"head":4, "rule":"[NP] ||| ein [NN,1] ||| a [1] ||| logp=0.0 use_a=1.0", "tails":[ 3 ], "f":{"logp":0.0, "use_a":1.0}, "weight":1.0 }, +{"head":5, "rule":"[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0.0", "tails":[ 1,4 ], "f":{"logp":0.0}, "weight":1.0 }, +{"head":6, "rule":"[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0.0", "tails":[ 0,5 ], "f":{"logp":0.0}, "weight":1.0 }, +{"head":7, "rule":"[Goal] ||| [S,1] ||| [1] ||| ", "tails":[ 6 ], "f":{}, "weight":1.0 } ] } diff --git a/grammar.rb b/grammar.rb index 70e8fda..003512c 100644 --- a/grammar.rb +++ b/grammar.rb @@ -13,23 +13,17 @@ class T end class NT - attr_accessor :symbol, :index, :left, :right + attr_accessor :symbol, :index - def initialize symbol=nil, index=nil, left=nil, right=nil + def initialize symbol=nil, index=nil @symbol = symbol @index = index - @left = left - @right = right end def from_s s - s.delete! '[]' - @symbol, meta = s.split '@' - if meta - span, index = meta.split ',' - @left, @right = span.split(':').map { |x| x.to_i } - @index = index.to_i - end + @symbol, @index = s.delete('[]').split ',' + @symbol.strip! + @index = @index.to_i-1 end def self.from_s s @@ -39,22 +33,23 @@ class NT end def to_s - "NT(#{@left},#{@right})<#{@symbol},#{@index}>" + "NT<#{@symbol},#{@index}>" end end class Rule attr_accessor :lhs, :rhs, :target, :map - def initialize lhs=nil, rhs=[], target=[] + def initialize lhs=nil, rhs=nil, target=nil, map=nil @lhs = lhs @rhs = rhs @target = target + @map = (map ? map : []) @arity_ = nil end def to_s - "#{@lhs} -> #{@rhs.map{ |i| i.to_s }.join ' '} ||| #{@target.map{ |i| i.to_s }.join ' '} [arity=#{arity}]" + "#{@lhs.to_s} -> #{@rhs.map{ |i| i.to_s }.join ' '} ||| #{@target.map{ |i| i.to_s }.join ' '} [arity=#{arity}]" end def arity @@ -62,12 +57,13 @@ class Rule return @arity_ end - def read_right_ s + def read_right_ s, fill_map=false _ = [] s.split.each { |x| x.strip! if x[0]=='[' && x[x.size-1] == ']' _ << NT.from_s(x) + @map << _.last.index if fill_map else _ << T.new(x) end @@ -79,10 +75,10 @@ class Rule lhs, rhs, target = splitpipe s, 3 @lhs = NT.from_s lhs @rhs = read_right_ rhs - @target = read_right_ target + @target = read_right_ target, true end - def self.from_s s + def self.from_s_x s r = self.new r.from_s s return r diff --git a/hg.rb b/hg.rb index f6af75d..43dbf79 100644 --- a/hg.rb +++ b/hg.rb @@ -8,17 +8,20 @@ module HG class HG::Node - attr_accessor :id, :outgoing, :incoming, :score + attr_accessor :id, :symbol, :left, :right, :outgoing, :incoming, :score - def initialize id=nil, cat=nil, outgoing=[], incoming=[], score=nil - @id = id + def initialize id=nil, symbol='', span=[-1,-1], outgoing=[], incoming=[], score=nil + @id = id + @symbol = symbol + @left = span[0] + @right = span[1] @outgoing = outgoing @incoming = incoming - @score = nil + @score = score end def to_s - "Node" + "Node" end end @@ -41,20 +44,20 @@ class HG::Hypergraph end def to_s - "Hypergraph" + "Hypergraph" end end class HG::Hyperedge attr_accessor :head, :tails, :score, :f, :mark, :rule - def initialize head=nil, tails=[], score=0.0, f=SparseVector.new, rule=nil + def initialize head=Node.new, tails=[], score=0.0, f=SparseVector.new, rule=nil @head = head @tails = tails @score = score @f = f @mark = 0 - @rule = (rule ? Grammar::Rule.from_s(rule) : nil) + @rule = (rule.class==String ? Grammar::Rule.from_s(rule) : rule) end def arity @@ -66,7 +69,7 @@ class HG::Hyperedge end def to_s - "Hyperedge" + "Hyperedge" end end @@ -124,27 +127,6 @@ def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new return best_path, toposorted.last.score end -def HG::viterbi_string hypergraph, root, semiring=ViterbiSemiring.new - toposorted = topological_sort hypergraph.nodes - init toposorted, semiring, root - s = '' - toposorted.each { |n| - best_s = nil - n.incoming.each { |e| - s = semiring.one - e.tails.each { |m| - s = semiring.multiply.call(s, m.score) - } - if n.score < semiring.multiply.call(s, e.score) # ViterbiSemiring add - best_s = e.e - end - n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.score)) - } - s += best_s if best_s - } - return s, toposorted.last.score -end - def HG::all_paths hypergraph, root toposorted = topological_sort hypergraph.nodes paths = [[]] @@ -162,6 +144,22 @@ def HG::all_paths hypergraph, root return paths end +def HG::derive path, cur, carry + edge = path.select { |e| e.head.symbol==cur.symbol \ + && e.head.left==cur.left \ + && e.head.right==cur.right }.first + j = 0 + edge.rule.target.each { |i| + if i.class == Grammar::NT + derive path, edge.tails[j], carry + j += 1 + else + carry << i + end + } + return carry +end + def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false nodes = [] edges = [] @@ -169,7 +167,7 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal h = JSON.parse File.new(fn).read w = SparseVector.from_h h['weights'] h['nodes'].each { |x| - n = Node.new x['id'] + n = Node.new x['id'], x['symbol'], x['span'] nodes << n nodes_by_id[n.id] = n } @@ -181,9 +179,9 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal x['rule']) if x['f'] if log_weights - e.weight = Math.exp(w.dot(e.f)) + e.score = Math.exp(w.dot(e.f)) else - e.weight = w.dot(e.f) + e.score = w.dot(e.f) end end e.tails.each { |m| @@ -195,20 +193,6 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal return Hypergraph.new(nodes, edges), nodes_by_id end -def HG::derive path, cur, carry - edge = path.select { |e| e.rule.lhs.symbol==cur.symbol \ - && e.rule.lhs.left==cur.left \ - && e.rule.lhs.right==cur.right }.first - edge.rule.target.each { |i| - if i.class == Grammar::NT - derive path, i, carry - else - carry << i - end - } - return carry -end - end #module diff --git a/test.rb b/test.rb index d9fbdfa..e65c9b3 100755 --- a/test.rb +++ b/test.rb @@ -5,14 +5,15 @@ require_relative 'hg' semiring = ViterbiSemiring.new hypergraph, nodes_by_id = HG::read_hypergraph_from_json('example/json/test.json', semiring, true) -#path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring -#s = HG::derive path, path.last.rule.lhs, [] -#puts "#{s.map { |i| i.word }.join ' '} ||| #{score}" +path, score = HG::viterbi_path hypergraph, nodes_by_id[-1], semiring +s = HG::derive path, path.last.head, [] +puts "#{s.map { |i| i.word }.join ' '}" +puts hypergraph.reset paths = HG::all_paths hypergraph, nodes_by_id[-1] paths.each { |p| - s = HG::derive p, p.last.rule.lhs, [] + s = HG::derive p, p.last.head, [] puts "#{s.map { |i| i.word }.join ' '}" } diff --git a/util/cdec_hg_to_json.py b/util/cdec_hg_to_json.py index 4e407c8..2fcc409 100755 --- a/util/cdec_hg_to_json.py +++ b/util/cdec_hg_to_json.py @@ -13,15 +13,16 @@ def hg2json(hg, weights): res += '"weights":{'+"\n" a = [] for i in weights: - a.append( '"%s":%s'%(i[0], i[1]) ) + if i[1] != 0: + a.append( '"%s":%s'%(i[0], i[1]) ) res += ", ".join(a)+"\n" res += "},\n" res += '"nodes":'+"\n" res += "[\n" a = [] - a.append( '{ "label":"root", "cat":"root" }' ) + a.append( '{ "id":-1, "cat":"root", "span":[-1,-1] }' ) for i in hg.nodes: - a.append( '{ "label":"%s", "cat":"%s", "left":%d, "right":%d }'%(i.id, i.cat, i.span[0], i.span[1]) ) + a.append('{ "id":%d, "cat":"%s", "span":[%d,%d] }'%(i.id, i.cat, i.span[0], i.span[1])) res += ",\n".join(a)+"\n" res += "],\n" res += '"edges":'+"\n" @@ -29,31 +30,20 @@ def hg2json(hg, weights): a = [] for i in hg.edges: s = "{" - s += '"head":"%s"'%(i.head_node.id) + s += '"head":%d'%(i.head_node.id) s += ', "rule":"%s"'%(i.trule) - s += ', "left":%d'%(i.span[0]) - s += ', "right":%d'%(i.span[1]) - #s += ', "leftx":%d'%(i.src_span[0]) - #s += ', "rightx":%d'%(i.src_span[1]) - s += ', "spans":"' - q = 0 - for z in i.tail_nodes: - s+= "%s|||%d|||(%d,%d);"%(z.cat, q, z.span[0], z.span[1]) - q += 1 - s += '"' + # f xs = ' "f":{' b = [] for j in i.feature_values: b.append( '"%s":%s'%(j[0], j[1]) ) xs += ", ".join(b) xs += "}," - c = [] - for j in i.tail_nodes: - c.append( '"'+str(j.id)+'"' ) - if len(c) > 0: - s += ', "tails":[ %s ],'%(",".join(c)) + # tails + if len(list(i.tail_nodes)) > 0: + s += ', "tails":[ %s ],'%(",".join([str(n.id) for n in i.tail_nodes])) else: - s += ', "tails":[ "root" ],' + s += ', "tails":[ -1 ],' s += xs s += ' "weight":%s }'%(i.prob) a.append(s) -- cgit v1.2.3